commit 7fa69b1354b71130f503f9cf826e5b03cf51911b Author: Gahow Wang Date: Fri Apr 10 13:22:19 2026 +0000 Initial project scaffold diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..832c7f7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,26 @@ +.venv/ +__pycache__/ +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.coverage +htmlcov/ +.nox/ +.tox/ +*.pyc +*.pyo +*.so +*.o +*.out +*.log +build/ +dist/ +*.egg-info/ +compile_commands.json +cmake-build-*/ +.cache/ +.idea/ +.vscode/ +.DS_Store +results/ +profile-output/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..725fbeb --- /dev/null +++ b/Makefile @@ -0,0 +1,23 @@ +PYTHON ?= python +KERNEL_LAB_CUDA_ARCH ?= 120 + +.PHONY: sync env info test bench build-cuda-ext + +sync: + uv sync + +env: + $(PYTHON) tools/check_env.py + +info: + $(PYTHON) tools/print_device_info.py + +test: + pytest -q + +bench: + ./tools/run_all_benchmarks.sh + +build-cuda-ext: + cmake -S kernels/cuda -B build/cuda -DCMAKE_CUDA_ARCHITECTURES=$(KERNEL_LAB_CUDA_ARCH) + cmake --build build/cuda -j diff --git a/README.md b/README.md new file mode 100644 index 0000000..1a59726 --- /dev/null +++ b/README.md @@ -0,0 +1,173 @@ +# kernel-lab + +`kernel-lab` is a learning-first GPU kernel workbook for studying the same operator across four layers: + +1. PyTorch reference code +2. Triton kernels +3. Native CUDA C++ kernels +4. PyTorch custom operator integration + +The repository is intentionally not a finished kernel library. The core Triton and CUDA implementations are left as TODO-driven lab exercises so you can study indexing, reductions, tiling, memory movement, correctness checks, and profiling in a controlled way. + +## Why This Repo Exists + +This lab is aimed at a modern NVIDIA workflow with a Blackwell-class consumer GPU such as an RTX 5090. The exercises themselves are mostly architecture-generic, so the project name stays broad while the build and docs keep hardware targeting explicit. + +Each operator exists for a reason: + +- `vector_add`: launch geometry, indexing, bounds checks +- `row_softmax`: reductions, numerical stability, bandwidth limits +- `tiled_matmul`: tiling, data reuse, memory hierarchy +- `online_softmax`: running max / running sum recurrence +- `flash_attention_fwd`: blockwise attention, masking, online normalization +- `pytorch_custom_op`: how kernels get surfaced as framework operators +- `profiling`: how to measure what actually happened + +## Learning Roadmap + +Start with the environment sanity task, then implement kernels in this order: + +1. `tasks/00_env_sanity` +2. `tasks/01_vector_add` +3. `tasks/02_row_softmax` +4. `tasks/03_tiled_matmul` +5. `tasks/04_online_softmax` +6. `tasks/05_flash_attention_fwd` +7. `tasks/06_pytorch_custom_op` +8. `tasks/07_profiling` + +The detailed week-1 plan and implementation order live in `docs/roadmap.md`. + +## Triton To CUDA Mapping + +The core mental model is: + +- Triton `program_id` maps to CUDA block-level work assignment +- Triton blocked tensor operations map to manual thread/block index arithmetic +- Triton masks map to explicit boundary checks in CUDA +- Triton load/store helpers abstract pointer math that CUDA exposes directly +- Triton hides synchronization details that CUDA requires you to reason about + +See `docs/triton_vs_cuda.md` for a longer concept table. 
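+
+As a minimal, doc-only illustration of the first rows of that mapping, here is a
+Triton fragment with the corresponding CUDA expression noted per line. It is a
+sketch for orientation, not one of the lab kernels, and `BLOCK` is an assumed
+compile-time tile size:
+
+```python
+@triton.jit
+def scale_by_two(x_ptr, out_ptr, n, BLOCK: tl.constexpr):
+    pid = tl.program_id(axis=0)                   # CUDA: blockIdx.x
+    offs = pid * BLOCK + tl.arange(0, BLOCK)      # CUDA: blockIdx.x * blockDim.x + threadIdx.x
+    mask = offs < n                               # CUDA: if (idx < n) { ... }
+    x = tl.load(x_ptr + offs, mask=mask)          # masked load replaces the bounds branch
+    tl.store(out_ptr + offs, 2.0 * x, mask=mask)  # masked store guards the tail block
+```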
+ +## Repository Layout + +```text +docs/ concept notes, roadmap, profiling guidance +reference/ plain PyTorch reference implementations +kernels/ Triton and CUDA learner skeletons +tasks/ workbook specs, TODO skeletons, task-local tests and benches +tests/ repository-wide checks and correctness scaffolding +bench/ cross-implementation benchmark harnesses +tools/ environment checks, profiling helpers, comparison scripts +``` + +## Environment Assumptions + +- Python 3.10+ +- PyTorch with CUDA support +- Triton installed if you want to run Triton tasks +- CUDA toolkit installed if you want to build the native extension +- A recent NVIDIA driver and a Blackwell-capable software stack + +Architecture targeting is configurable: + +- `KERNEL_LAB_CUDA_ARCH=120` for Python extension loading helpers +- `-DCMAKE_CUDA_ARCHITECTURES=120` for direct CMake builds + +If your toolkit, driver, or local environment does not yet expose Blackwell exactly as expected, keep the architecture explicit and adjust it instead of editing kernel source files. + +## Install + +```bash +uv sync +``` + +If you want commands to run inside the uv-managed environment without activating it manually, use `uv run`, for example: + +```bash +uv run pytest -q +uv run python tools/check_env.py +``` + +## Run Environment Checks + +```bash +uv run python tools/check_env.py +uv run python tools/print_device_info.py +``` + +## Run Tests + +The default test suite validates references and scaffolding. Triton/CUDA task tests skip gracefully until you implement the learner TODOs. + +```bash +uv run pytest -q +``` + +You can also use: + +```bash +make sync +uv run pytest -q +uv run ./tools/run_all_tests.sh +``` + +## Run Benchmarks + +Benchmarks compare PyTorch, Triton, and CUDA when available. Incomplete implementations are reported and skipped. + +```bash +uv run python bench/bench_vector_add.py --device cuda +uv run python bench/bench_softmax.py --device cuda +uv run python bench/bench_matmul.py --device cuda +uv run python bench/bench_attention.py --device cuda +uv run python bench/compare_impls.py --task vector_add +``` + +Or run the helper: + +```bash +uv run ./tools/run_all_benchmarks.sh +``` + +## Build The CUDA Extension + +Two paths are provided: + +1. CMake-first native build: + +```bash +cmake -S kernels/cuda -B build/cuda -DCMAKE_CUDA_ARCHITECTURES=${KERNEL_LAB_CUDA_ARCH:-120} +cmake --build build/cuda -j +``` + +2. Python-driven extension loading for lab experiments: + +```bash +uv run python tasks/06_pytorch_custom_op/extension_skeleton.py +``` + +The binding and CUDA source files build a minimal extension skeleton. The learner is expected to fill in operator registration and kernel dispatch details. + +## Profile Kernels + +Start from one kernel, one shape, one implementation: + +```bash +uv run ./tools/profile_ncu.sh python bench/bench_vector_add.py --device cuda --mode cuda +uv run ./tools/profile_nsys.sh python bench/bench_attention.py --device cuda --mode triton +``` + +See `docs/profiling_guide.md` for warmup, synchronization, and first metrics to inspect. + +## How To Use The Workbook + +- Read the `spec.md` for the current task. +- Run the reference implementation and tests first. +- Read the Triton and CUDA skeleton side by side. +- Fill in one TODO at a time. +- Re-run correctness tests before looking at benchmark numbers. +- Only profile after the kernel is correct on small shapes. + +This repo is designed to make the learning path visible. The TODOs are the point. 
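+
+## Locating Torch For The CMake Build
+
+`find_package(Torch REQUIRED)` in `kernels/cuda/CMakeLists.txt` has to find the
+CMake config that ships inside the installed `torch` wheel. If configuration
+fails with a "Could not find Torch" style error, pointing `CMAKE_PREFIX_PATH` at
+the wheel usually resolves it. A sketch, assuming the uv-managed environment
+from above:
+
+```bash
+uv run cmake -S kernels/cuda -B build/cuda \
+  -DCMAKE_PREFIX_PATH="$(uv run python -c 'import torch; print(torch.utils.cmake_prefix_path)')" \
+  -DCMAKE_CUDA_ARCHITECTURES=${KERNEL_LAB_CUDA_ARCH:-120}
+uv run cmake --build build/cuda -j
+```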
diff --git a/bench/bench_attention.py b/bench/bench_attention.py new file mode 100644 index 0000000..d8d188b --- /dev/null +++ b/bench/bench_attention.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +import argparse +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.flash_attention_fwd import triton_flash_attention_fwd +from reference.torch_attention import torch_attention +from tools.lab_extension import build_extension + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 20, **kwargs) -> float: + for _ in range(warmup): + fn(*args, **kwargs) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + torch.cuda.synchronize() + start = time.perf_counter() + fn(*args, **kwargs) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu") + parser.add_argument("--mode", choices=["all", "torch", "triton", "cuda"], default="all") + parser.add_argument("--batch", type=int, default=2) + parser.add_argument("--heads", type=int, default=8) + parser.add_argument("--seq", type=int, default=128) + parser.add_argument("--dim", type=int, default=64) + parser.add_argument("--causal", action="store_true") + args = parser.parse_args() + + q = torch.randn(args.batch, args.heads, args.seq, args.dim, device=args.device) + k = torch.randn(args.batch, args.heads, args.seq, args.dim, device=args.device) + v = torch.randn(args.batch, args.heads, args.seq, args.dim, device=args.device) + + if args.mode in {"all", "torch"}: + elapsed_ms = benchmark(torch_attention, q, k, v, causal=args.causal) + print(f"torch: {elapsed_ms:.3f} ms") + + if args.device == "cuda" and args.mode in {"all", "triton"}: + try: + elapsed_ms = benchmark(triton_flash_attention_fwd, q, k, v, causal=args.causal) + print(f"triton: {elapsed_ms:.3f} ms") + except (NotImplementedError, RuntimeError) as exc: + print(f"triton: skipped ({exc})") + + if args.device == "cuda" and args.mode in {"all", "cuda"}: + ext = build_extension(verbose=False) + if ext is None or not hasattr(torch.ops, "kernel_lab"): + print("cuda: skipped (extension unavailable)") + else: + try: + elapsed_ms = benchmark( + torch.ops.kernel_lab.flash_attention_fwd, q, k, v, args.causal + ) + print(f"cuda: {elapsed_ms:.3f} ms") + except Exception as exc: + print(f"cuda: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/bench/bench_matmul.py b/bench/bench_matmul.py new file mode 100644 index 0000000..62445f7 --- /dev/null +++ b/bench/bench_matmul.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import argparse +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.tiled_matmul import triton_tiled_matmul +from reference.torch_matmul import torch_matmul +from tools.lab_extension import build_extension + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 20) -> float: + for _ in range(warmup): + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + 
torch.cuda.synchronize() + start = time.perf_counter() + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def report(name: str, elapsed_ms: float, m: int, n: int, k: int) -> None: + tflops = (2.0 * m * n * k) / (elapsed_ms * 1e-3) / 1e12 + print(f"{name}: {elapsed_ms:.3f} ms | throughput {tflops:.3f} TFLOP/s") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu") + parser.add_argument("--mode", choices=["all", "torch", "triton", "cuda"], default="all") + parser.add_argument("--m", type=int, default=1024) + parser.add_argument("--n", type=int, default=1024) + parser.add_argument("--k", type=int, default=1024) + args = parser.parse_args() + + a = torch.randn(args.m, args.k, device=args.device) + b = torch.randn(args.k, args.n, device=args.device) + + if args.mode in {"all", "torch"}: + report("torch", benchmark(torch_matmul, a, b), args.m, args.n, args.k) + + if args.device == "cuda" and args.mode in {"all", "triton"}: + try: + report("triton", benchmark(triton_tiled_matmul, a, b), args.m, args.n, args.k) + except (NotImplementedError, RuntimeError) as exc: + print(f"triton: skipped ({exc})") + + if args.device == "cuda" and args.mode in {"all", "cuda"}: + ext = build_extension(verbose=False) + if ext is None or not hasattr(torch.ops, "kernel_lab"): + print("cuda: skipped (extension unavailable)") + else: + try: + report( + "cuda", + benchmark(torch.ops.kernel_lab.tiled_matmul, a, b), + args.m, + args.n, + args.k, + ) + except Exception as exc: + print(f"cuda: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/bench/bench_softmax.py b/bench/bench_softmax.py new file mode 100644 index 0000000..ab146bb --- /dev/null +++ b/bench/bench_softmax.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +import argparse +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.online_softmax import triton_online_softmax +from kernels.triton.row_softmax import triton_row_softmax +from reference.torch_online_softmax import torch_online_softmax +from reference.torch_row_softmax import torch_row_softmax +from tools.lab_extension import build_extension + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 25) -> float: + for _ in range(warmup): + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + torch.cuda.synchronize() + start = time.perf_counter() + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def report(name: str, elapsed_ms: float, x: torch.Tensor) -> None: + logical_bytes = 3 * x.numel() * x.element_size() + gbps = logical_bytes / (elapsed_ms * 1e-3) / 1e9 + print(f"{name}: {elapsed_ms:.3f} ms | logical bandwidth {gbps:.2f} GB/s") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu") + parser.add_argument("--mode", choices=["all", "torch", "triton", "cuda"], default="all") + parser.add_argument("--variant", choices=["row", "online"], default="row") + parser.add_argument("--rows", type=int, default=4096) + parser.add_argument("--cols", 
type=int, default=1024) + args = parser.parse_args() + + x = torch.randn(args.rows, args.cols, device=args.device) + + ref_fn = torch_row_softmax if args.variant == "row" else torch_online_softmax + triton_fn = triton_row_softmax if args.variant == "row" else triton_online_softmax + cuda_name = "row_softmax" if args.variant == "row" else "online_softmax" + + if args.mode in {"all", "torch"}: + report(f"torch_{args.variant}_softmax", benchmark(ref_fn, x), x) + + if args.device == "cuda" and args.mode in {"all", "triton"}: + try: + report(f"triton_{args.variant}_softmax", benchmark(triton_fn, x), x) + except (NotImplementedError, RuntimeError) as exc: + print(f"triton_{args.variant}_softmax: skipped ({exc})") + + if args.device == "cuda" and args.mode in {"all", "cuda"}: + ext = build_extension(verbose=False) + if ext is None or not hasattr(torch.ops, "kernel_lab"): + print(f"cuda_{args.variant}_softmax: skipped (extension unavailable)") + else: + try: + cuda_fn = getattr(torch.ops.kernel_lab, cuda_name) + report(f"cuda_{args.variant}_softmax", benchmark(cuda_fn, x), x) + except Exception as exc: + print(f"cuda_{args.variant}_softmax: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/bench/bench_vector_add.py b/bench/bench_vector_add.py new file mode 100644 index 0000000..5cb3dc9 --- /dev/null +++ b/bench/bench_vector_add.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +import argparse +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.vector_add import triton_vector_add +from reference.torch_vector_add import torch_vector_add +from tools.lab_extension import build_extension + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 30) -> float: + for _ in range(warmup): + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + torch.cuda.synchronize() + start = time.perf_counter() + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def report(name: str, elapsed_ms: float, x: torch.Tensor) -> None: + bytes_moved = 3 * x.numel() * x.element_size() + gbps = bytes_moved / (elapsed_ms * 1e-3) / 1e9 + print(f"{name}: {elapsed_ms:.3f} ms | effective bandwidth {gbps:.2f} GB/s") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu") + parser.add_argument("--mode", choices=["all", "torch", "triton", "cuda"], default="all") + parser.add_argument("--numel", type=int, default=1 << 24) + args = parser.parse_args() + + x = torch.randn(args.numel, device=args.device) + y = torch.randn(args.numel, device=args.device) + + if args.mode in {"all", "torch"}: + report("torch", benchmark(torch_vector_add, x, y), x) + + if args.device == "cuda" and args.mode in {"all", "triton"}: + try: + report("triton", benchmark(triton_vector_add, x, y), x) + except (NotImplementedError, RuntimeError) as exc: + print(f"triton: skipped ({exc})") + + if args.device == "cuda" and args.mode in {"all", "cuda"}: + ext = build_extension(verbose=False) + if ext is None or not hasattr(torch.ops, "kernel_lab"): + print("cuda: skipped (extension unavailable)") + else: + try: + report("cuda", benchmark(torch.ops.kernel_lab.vector_add, x, y), x) + except Exception as exc: + 
print(f"cuda: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/bench/compare_impls.py b/bench/compare_impls.py new file mode 100644 index 0000000..deb58db --- /dev/null +++ b/bench/compare_impls.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +import argparse +import subprocess +import sys +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] + + +TASK_TO_SCRIPT = { + "vector_add": ROOT / "bench" / "bench_vector_add.py", + "softmax": ROOT / "bench" / "bench_softmax.py", + "matmul": ROOT / "bench" / "bench_matmul.py", + "attention": ROOT / "bench" / "bench_attention.py", +} + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--task", choices=sorted(TASK_TO_SCRIPT), required=True) + parser.add_argument("extra_args", nargs="*") + args = parser.parse_args() + + cmd = [sys.executable, str(TASK_TO_SCRIPT[args.task]), *args.extra_args] + subprocess.run(cmd, check=True) + + +if __name__ == "__main__": + main() + diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000..91b2417 --- /dev/null +++ b/conftest.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +import sys +from pathlib import Path + + +ROOT = Path(__file__).resolve().parent +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) diff --git a/docs/blackwell_notes.md b/docs/blackwell_notes.md new file mode 100644 index 0000000..647ee15 --- /dev/null +++ b/docs/blackwell_notes.md @@ -0,0 +1,20 @@ +# Blackwell Notes + +This repository targets a Blackwell-style workflow, but keeps the build configuration explicit because local toolchain support may differ across systems. + +## Build Guidance + +- Prefer explicit architecture selection over hidden defaults. +- Use `KERNEL_LAB_CUDA_ARCH=120` for Python-side build helpers when your local environment supports it. +- Use `-DCMAKE_CUDA_ARCHITECTURES=120` with CMake for direct native builds. +- If your toolkit does not yet accept the exact architecture value you want, adjust the build flag rather than editing the kernels. + +## What To Watch On A New GPU Generation + +- compiler support for the target architecture +- PyTorch wheel compatibility +- Triton support level +- driver/toolkit mismatch +- profiler tool compatibility + +Treat environment validation as part of the lab, not as a one-time setup nuisance. diff --git a/docs/cuda_execution_model.md b/docs/cuda_execution_model.md new file mode 100644 index 0000000..b29ea4c --- /dev/null +++ b/docs/cuda_execution_model.md @@ -0,0 +1,40 @@ +# CUDA Execution Model + +## How To Read A CUDA Kernel + +Use this short checklist every time: + +1. Find the logical work unit. + Ask what one thread, warp, or block is responsible for. +2. Decode the index math. + Look for `blockIdx`, `threadIdx`, `blockDim`, and any derived offsets. +3. Inspect the memory accesses. + Separate global loads, shared memory loads, stores, and reductions. +4. Find synchronization points. + Every `__syncthreads()` should protect a clear shared-memory phase boundary. +5. Check boundary conditions. + Out-of-range reads and stores are a common first bug. +6. Compare against the reference implementation. + Make sure the math, masking, and shape conventions still match. 
+ +## Execution Hierarchy + +- Grid: all blocks launched for one kernel +- Block: a cooperating team of threads +- Thread: one scalar execution context + +CUDA makes several things explicit that Triton abstracts: + +- manual thread/block decomposition +- pointer arithmetic +- shared-memory allocation and reuse +- synchronization +- launch configuration choices + +## Reading Order For This Lab + +- `vector_add.cu`: pure indexing +- `row_softmax.cu`: reduction structure +- `tiled_matmul.cu`: shared-memory tiling +- `online_softmax.cu`: stateful reduction recurrence +- `flash_attention_fwd.cu`: composition of multiple ideas diff --git a/docs/flashattention_notes.md b/docs/flashattention_notes.md new file mode 100644 index 0000000..bf170f5 --- /dev/null +++ b/docs/flashattention_notes.md @@ -0,0 +1,28 @@ +# FlashAttention Notes + +FlashAttention-style kernels are useful because the naive attention pipeline materializes large score tensors and spends too much bandwidth moving them. + +## The Core Idea + +Instead of: + +1. computing the full score matrix +2. writing it out +3. running softmax +4. reading it back +5. multiplying by `V` + +you process attention block by block and keep more intermediate state on chip. + +## Why Online Softmax Matters + +Blockwise processing changes the normalization problem. You cannot assume you have seen the full row. The running max / running sum recurrence lets you update normalization state incrementally without losing numerical stability. + +## What This Lab Covers + +- forward pass only +- small-shape correctness first +- optional causal masking +- side-by-side Triton and CUDA skeletons + +This repo intentionally stops short of a polished production FlashAttention implementation. The point is to expose the algorithmic structure. diff --git a/docs/gpu_basics.md b/docs/gpu_basics.md new file mode 100644 index 0000000..6b4e57d --- /dev/null +++ b/docs/gpu_basics.md @@ -0,0 +1,30 @@ +# GPU Basics + +This lab assumes you are learning GPU kernels as structured data-parallel programs. + +## Core Ideas + +- GPU throughput comes from massive parallelism, not a single fast thread. +- Launch geometry determines which logical elements each thread or program instance owns. +- Global memory is large and slow relative to on-chip storage. +- Kernel design is often about reducing memory traffic and increasing reuse. + +## Terms To Keep Straight + +- thread: the smallest execution entity in CUDA +- warp: a hardware scheduling group, usually 32 threads +- block: a cooperating group of threads with shared memory access +- grid: the full launch of all blocks +- program instance: Triton's block-level work abstraction + +## Mental Model For This Repo + +Each task asks the same questions in both Triton and CUDA: + +- What data does one unit of work own? +- How is that ownership computed from launch indices? +- Which reads are coalesced or contiguous? +- Which intermediate values must be reduced? +- Which values should be reused on chip? + +Keep a notebook. Write down the answers before you code. diff --git a/docs/profiling_guide.md b/docs/profiling_guide.md new file mode 100644 index 0000000..3252dfb --- /dev/null +++ b/docs/profiling_guide.md @@ -0,0 +1,87 @@ +# Profiling Guide + +## Profile One Kernel At A Time + +Good profiling starts narrow: + +- one implementation +- one shape +- one dtype +- one device +- one command you can rerun + +If you profile a full training script too early, you will not know which kernel you are looking at. 
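+
+One cheap way to stay narrow is to wrap the single call you care about in an
+NVTX range so it is easy to find on a timeline. A sketch using PyTorch's
+built-in NVTX helpers (`fn`, `x`, and `y` stand for whichever kernel and inputs
+you are studying):
+
+```python
+import torch
+
+torch.cuda.nvtx.range_push("vector_add_one_shape")  # appears in the nsys timeline
+out = fn(x, y)
+torch.cuda.synchronize()
+torch.cuda.nvtx.range_pop()
+```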
+ +## Why Warmup Matters + +The first iterations may include: + +- lazy module loading +- JIT compilation +- cache effects +- allocator setup + +Warm up first, then measure. + +## Why Synchronization Matters + +GPU work is asynchronous with respect to Python. If you do not synchronize before stopping a timer, you usually measure launch overhead instead of kernel runtime. + +Use `torch.cuda.synchronize()` around timed regions. + +## How To Avoid Misleading Timings + +- keep shapes fixed +- use multiple repetitions +- report median, not only minimum +- separate correctness from performance testing +- compare implementations under the same dtype and device conditions +- check that all inputs are already on the GPU + +## First Metrics To Inspect + +- kernel duration +- achieved memory throughput +- occupancy +- DRAM transactions or bandwidth +- shared-memory throughput when tiling is relevant +- eligible warps per cycle when investigating latency hiding + +## Practical `ncu` Examples + +```bash +ncu --set full --target-processes all \ + python bench/bench_vector_add.py --device cuda --mode cuda +``` + +```bash +ncu --metrics sm__throughput.avg.pct_of_peak_sustained_elapsed,\ +dram__throughput.avg.pct_of_peak_sustained_elapsed \ + python bench/bench_softmax.py --device cuda --mode triton +``` + +## Practical `nsys` Examples + +```bash +nsys profile --trace=cuda,nvtx,osrt --sample=none \ + -o profile-output/attention_triton \ + python bench/bench_attention.py --device cuda --mode triton +``` + +```bash +nsys profile --trace=cuda,nvtx,osrt --sample=none \ + -o profile-output/matmul_cuda \ + python bench/bench_matmul.py --device cuda --mode cuda +``` + +## Checklist Before Trusting A Benchmark Result + +- Was there a warmup phase? +- Was the device synchronized before and after timing? +- Did all implementations run the same math? +- Were outputs checked against a reference? +- Were shapes and dtypes identical? +- Was one implementation silently skipped or falling back to CPU? +- Did you report median time over several repetitions? +- Is the measured quantity bandwidth-bound or compute-bound? +- Did you accidentally include setup or compilation time? 
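+
+## Cross-Checking With CUDA Events
+
+As a sanity check on host-side wall-clock numbers, CUDA events time the work on
+the device itself. A minimal sketch, assuming `fn(x)` is the call under study
+and warmup has already happened:
+
+```python
+start = torch.cuda.Event(enable_timing=True)
+end = torch.cuda.Event(enable_timing=True)
+
+start.record()
+fn(x)
+end.record()
+torch.cuda.synchronize()  # wait until `end` has actually been recorded
+print(f"{start.elapsed_time(end):.3f} ms")
+```
+
+If the event-based number and the synchronized wall-clock number disagree badly,
+revisit the checklist above before trusting either.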
diff --git a/docs/roadmap.md b/docs/roadmap.md new file mode 100644 index 0000000..53716d7 --- /dev/null +++ b/docs/roadmap.md @@ -0,0 +1,75 @@ +# Roadmap + +## Week 1 Study Plan + +Day 1: + +- Run `tools/check_env.py` +- Read `docs/gpu_basics.md` +- Read `docs/cuda_execution_model.md` +- Inspect `reference/torch_vector_add.py` +- Implement or partially implement `tasks/01_vector_add/triton_skeleton.py` + +Day 2: + +- Read `docs/triton_vs_cuda.md` +- Inspect `kernels/cuda/src/vector_add.cu` +- Fill in vector add indexing TODOs in Triton and CUDA +- Run `pytest -q tasks/01_vector_add/test_task.py` + +Day 3: + +- Read `reference/torch_row_softmax.py` +- Read `tasks/02_row_softmax/spec.md` +- Implement numerically stable row softmax in Triton first +- Compare against the CUDA skeleton and map the reduction strategy + +Day 4: + +- Study `tasks/03_tiled_matmul/spec.md` +- Draw the tile decomposition on paper +- Implement one matmul tile path with correctness-only priorities + +Day 5: + +- Read `docs/flashattention_notes.md` +- Read `tasks/04_online_softmax/spec.md` +- Derive the running max / running sum recurrence informally + +Day 6: + +- Inspect `tasks/05_flash_attention_fwd/spec.md` +- Trace the PyTorch reference line by line +- Annotate where Q/K/V loads, score computation, normalization, and output accumulation happen + +Day 7: + +- Read `docs/profiling_guide.md` +- Run one benchmark and one profiler command +- Write down which numbers changed after warmup and synchronization + +## Recommended TODO Order + +1. Environment checks +2. Vector add Triton +3. Vector add CUDA +4. Row softmax Triton +5. Row softmax CUDA +6. Tiled matmul Triton +7. Tiled matmul CUDA +8. Online softmax Triton +9. Online softmax CUDA +10. Flash attention forward Triton +11. Flash attention forward CUDA +12. PyTorch custom op binding +13. Profiling passes and benchmark validation + +## What To Focus On First + +- Correctness on tiny shapes +- Clear index math +- Explicit shape assumptions +- Numerically stable reductions +- Repeatable measurement + +Do not chase peak performance before you can explain the memory traffic and launch geometry of your kernel. 
diff --git a/docs/triton_vs_cuda.md b/docs/triton_vs_cuda.md new file mode 100644 index 0000000..722254e --- /dev/null +++ b/docs/triton_vs_cuda.md @@ -0,0 +1,30 @@ +# Triton Vs CUDA + +## Concept Mapping Table + +| Triton concept | CUDA concept | What to notice | +| --- | --- | --- | +| `tl.program_id(axis=0)` | `blockIdx.x` and block ownership | Both assign a chunk of logical work to a block-scale unit | +| `tl.arange(0, BLOCK)` | `threadIdx.x` or manual lane-local offsets | Triton expresses vectors of indices directly | +| masked `tl.load` / `tl.store` | explicit `if (idx < n)` checks | Same boundary problem, different syntax | +| blocked tensor operations | thread/block decomposition plus loops | Triton lifts index sets into tensor expressions | +| pointer arithmetic in element units | byte-addressed pointer math and indexing | CUDA makes layout mechanics more visible | +| implicit vectorized math | manual scalar or vector intrinsics | Triton often reads like array algebra | +| autotuned launch parameters | manual block-size tuning | Both still depend on the memory hierarchy | +| block pointers and tile views | shared memory tiles and cooperative loads | The same reuse idea shows up with different APIs | +| reduction combinators | warp/block reductions | Same algorithmic structure, different implementation burden | +| masks and predicates | control flow and bounds checks | Divergence and predication still matter | + +## How To Compare Side By Side + +1. Start from the reference PyTorch function and identify the mathematical operator. +2. In the Triton version, ask what one program instance owns. +3. In the CUDA version, ask what one block and one thread own. +4. Match the memory reads and writes, not just the variable names. +5. Write down where reduction state lives in each version. +6. For tiled code, identify when data moves from global memory to on-chip storage. +7. Only then compare performance. + +## Rule Of Thumb + +Triton usually compresses the "how" so you can focus on the blocked tensor math. CUDA exposes the "how" directly, which is why it is valuable to study both. 
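+
+As one worked row of the table, consider a row-max reduction. The Triton side is
+a doc-only sketch (assumptions: one program per row, `BLOCK` covers the whole
+row, out-of-range lanes are padded with `-inf`):
+
+```python
+row = tl.load(row_ptr + offs, mask=offs < num_cols, other=float("-inf"))
+row_max = tl.max(row, axis=0)  # one reduction combinator
+```
+
+The CUDA counterpart has to decide where the reduction state lives: a
+shared-memory tree with `__syncthreads()` between halving steps, warp shuffles
+such as `__shfl_down_sync` for the final 32 lanes, or both. Same mathematics,
+much more explicit machinery, which is the gap this document wants you to see.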
diff --git a/kernels/__init__.py b/kernels/__init__.py
new file mode 100644
index 0000000..86d2a32
--- /dev/null
+++ b/kernels/__init__.py
@@ -0,0 +1,2 @@
+"""Kernel modules for Triton and CUDA learning tasks."""
+
diff --git a/kernels/cuda/CMakeLists.txt b/kernels/cuda/CMakeLists.txt
new file mode 100644
index 0000000..a3d42a8
--- /dev/null
+++ b/kernels/cuda/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.25)
+project(kernel_lab LANGUAGES CXX CUDA)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  set(CMAKE_CUDA_ARCHITECTURES 120 CACHE STRING "Target CUDA architectures")
+endif()
+
+find_package(Torch REQUIRED)
+
+add_library(kernel_lab_extension SHARED
+  binding/binding.cpp
+  src/vector_add.cu
+  src/row_softmax.cu
+  src/tiled_matmul.cu
+  src/online_softmax.cu
+  src/flash_attention_fwd.cu
+)
+
+target_include_directories(kernel_lab_extension PRIVATE include)
+target_link_libraries(kernel_lab_extension PRIVATE "${TORCH_LIBRARIES}")
+target_compile_features(kernel_lab_extension PRIVATE cxx_std_17)
+set_target_properties(kernel_lab_extension PROPERTIES
+  PREFIX ""
+  CUDA_SEPARABLE_COMPILATION ON
+)
diff --git a/kernels/cuda/binding/binding.cpp b/kernels/cuda/binding/binding.cpp
new file mode 100644
index 0000000..abfd7f8
--- /dev/null
+++ b/kernels/cuda/binding/binding.cpp
@@ -0,0 +1,69 @@
+#include "../include/common.h"
+#include "../include/cuda_utils.h"
+
+#include <torch/extension.h>
+
+namespace kernel_lab {
+
+torch::Tensor vector_add_dispatch(torch::Tensor x, torch::Tensor y) {
+  check_cuda_pair(x, y);
+  LAB_CHECK_SAME_SHAPE(x, y);
+  return vector_add_cuda(x, y);
+}
+
+torch::Tensor row_softmax_dispatch(torch::Tensor x) {
+  LAB_CHECK_CUDA(x);
+  LAB_CHECK_CONTIGUOUS(x);
+  return row_softmax_cuda(x);
+}
+
+torch::Tensor tiled_matmul_dispatch(torch::Tensor a, torch::Tensor b) {
+  check_cuda_pair(a, b);
+  return tiled_matmul_cuda(a, b);
+}
+
+torch::Tensor online_softmax_dispatch(torch::Tensor x) {
+  LAB_CHECK_CUDA(x);
+  LAB_CHECK_CONTIGUOUS(x);
+  return online_softmax_cuda(x);
+}
+
+torch::Tensor flash_attention_fwd_dispatch(
+    torch::Tensor q,
+    torch::Tensor k,
+    torch::Tensor v,
+    bool causal) {
+  LAB_CHECK_CUDA(q);
+  LAB_CHECK_CUDA(k);
+  LAB_CHECK_CUDA(v);
+  return flash_attention_fwd_cuda(q, k, v, causal);
+}
+
+} // namespace kernel_lab
+
+TORCH_LIBRARY(kernel_lab, m) {
+  m.def("vector_add(Tensor x, Tensor y) -> Tensor");
+  m.def("row_softmax(Tensor x) -> Tensor");
+  m.def("tiled_matmul(Tensor a, Tensor b) -> Tensor");
+  m.def("online_softmax(Tensor x) -> Tensor");
+  m.def("flash_attention_fwd(Tensor q, Tensor k, Tensor v, bool causal=False) -> Tensor");
+}
+
+TORCH_LIBRARY_IMPL(kernel_lab, CUDA, m) {
+  m.impl("vector_add", &kernel_lab::vector_add_dispatch);
+  m.impl("row_softmax", &kernel_lab::row_softmax_dispatch);
+  m.impl("tiled_matmul", &kernel_lab::tiled_matmul_dispatch);
+  m.impl("online_softmax", &kernel_lab::online_softmax_dispatch);
+  m.impl("flash_attention_fwd", &kernel_lab::flash_attention_fwd_dispatch);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("vector_add_dispatch", &kernel_lab::vector_add_dispatch, "Vector add dispatch");
+  m.def("row_softmax_dispatch", &kernel_lab::row_softmax_dispatch, "Row softmax dispatch");
+  m.def("tiled_matmul_dispatch", &kernel_lab::tiled_matmul_dispatch, "Tiled matmul dispatch");
+  m.def("online_softmax_dispatch", &kernel_lab::online_softmax_dispatch, "Online softmax dispatch");
+  m.def(
+      "flash_attention_fwd_dispatch",
+      &kernel_lab::flash_attention_fwd_dispatch,
+      "Flash attention forward dispatch");
+}
diff --git a/kernels/cuda/include/common.h b/kernels/cuda/include/common.h
new file mode 100644
index 0000000..e8b58db
--- /dev/null
+++ b/kernels/cuda/include/common.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <torch/extension.h>
+
+namespace kernel_lab {
+
+torch::Tensor vector_add_cuda(torch::Tensor x, torch::Tensor y);
+torch::Tensor row_softmax_cuda(torch::Tensor x);
+torch::Tensor tiled_matmul_cuda(torch::Tensor a, torch::Tensor b);
+torch::Tensor online_softmax_cuda(torch::Tensor x);
+torch::Tensor flash_attention_fwd_cuda(
+    torch::Tensor q,
+    torch::Tensor k,
+    torch::Tensor v,
+    bool causal);
+
+} // namespace kernel_lab
diff --git a/kernels/cuda/include/cuda_utils.h b/kernels/cuda/include/cuda_utils.h
new file mode 100644
index 0000000..0b48677
--- /dev/null
+++ b/kernels/cuda/include/cuda_utils.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <torch/extension.h>
+
+#define LAB_CHECK_CUDA(x) TORCH_CHECK((x).is_cuda(), #x " must be a CUDA tensor")
+#define LAB_CHECK_CONTIGUOUS(x) TORCH_CHECK((x).is_contiguous(), #x " must be contiguous")
+#define LAB_CHECK_SAME_SHAPE(x, y) TORCH_CHECK((x).sizes() == (y).sizes(), #x " and " #y " must have the same shape")
+
+inline void check_cuda_pair(const torch::Tensor& x, const torch::Tensor& y) {
+  LAB_CHECK_CUDA(x);
+  LAB_CHECK_CUDA(y);
+  LAB_CHECK_CONTIGUOUS(x);
+  LAB_CHECK_CONTIGUOUS(y);
+}
+
diff --git a/kernels/cuda/src/flash_attention_fwd.cu b/kernels/cuda/src/flash_attention_fwd.cu
new file mode 100644
index 0000000..1af35dd
--- /dev/null
+++ b/kernels/cuda/src/flash_attention_fwd.cu
@@ -0,0 +1,54 @@
+#include "../include/common.h"
+#include "../include/cuda_utils.h"
+
+namespace kernel_lab {
+
+__global__ void flash_attention_fwd_kernel(
+    const float* q,
+    const float* k,
+    const float* v,
+    float* out,
+    int64_t batch,
+    int64_t heads,
+    int64_t seq_len,
+    int64_t head_dim,
+    bool causal) {
+  (void)q;
+  (void)k;
+  (void)v;
+  (void)out;
+  (void)batch;
+  (void)heads;
+  (void)seq_len;
+  (void)head_dim;
+  (void)causal;
+
+  // TODO(student): assign each block to a batch/head/query tile.
+  // TODO(student): cooperatively load K/V tiles.
+  // TODO(student): compute score blocks and apply causal masking when requested.
+  // TODO(student): maintain online softmax state and accumulate the output tile.
+} + +torch::Tensor flash_attention_fwd_cuda( + torch::Tensor q, + torch::Tensor k, + torch::Tensor v, + bool causal) { + LAB_CHECK_CUDA(q); + LAB_CHECK_CUDA(k); + LAB_CHECK_CUDA(v); + LAB_CHECK_CONTIGUOUS(q); + LAB_CHECK_CONTIGUOUS(k); + LAB_CHECK_CONTIGUOUS(v); + TORCH_CHECK(q.sizes() == k.sizes(), "q and k must match"); + TORCH_CHECK(q.sizes() == v.sizes(), "q and v must match"); + TORCH_CHECK(q.dim() == 4, "flash_attention_fwd_cuda expects [batch, heads, seq, dim]"); + TORCH_CHECK(q.scalar_type() == torch::kFloat32, "flash_attention_fwd_cuda currently assumes float32"); + + TORCH_CHECK( + false, + "TODO(student): implement flash_attention_fwd_cuda in kernels/cuda/src/flash_attention_fwd.cu."); + return torch::Tensor(); +} + +} // namespace kernel_lab diff --git a/kernels/cuda/src/online_softmax.cu b/kernels/cuda/src/online_softmax.cu new file mode 100644 index 0000000..ad6e975 --- /dev/null +++ b/kernels/cuda/src/online_softmax.cu @@ -0,0 +1,36 @@ +#include "../include/common.h" +#include "../include/cuda_utils.h" + +namespace kernel_lab { + +__global__ void online_softmax_kernel( + const float* x, + float* out, + int64_t num_rows, + int64_t num_cols) { + int row = blockIdx.x; + if (row >= num_rows) { + return; + } + + // TODO(student): maintain running max and running sum across column tiles. + // TODO(student): write the normalized row after finishing the recurrence. + (void)x; + (void)out; + (void)num_rows; + (void)num_cols; +} + +torch::Tensor online_softmax_cuda(torch::Tensor x) { + LAB_CHECK_CUDA(x); + LAB_CHECK_CONTIGUOUS(x); + TORCH_CHECK(x.dim() == 2, "online_softmax_cuda expects a 2D tensor"); + TORCH_CHECK(x.scalar_type() == torch::kFloat32, "online_softmax_cuda currently assumes float32"); + + TORCH_CHECK( + false, + "TODO(student): implement online_softmax_cuda in kernels/cuda/src/online_softmax.cu."); + return torch::Tensor(); +} + +} // namespace kernel_lab diff --git a/kernels/cuda/src/row_softmax.cu b/kernels/cuda/src/row_softmax.cu new file mode 100644 index 0000000..2a64f70 --- /dev/null +++ b/kernels/cuda/src/row_softmax.cu @@ -0,0 +1,37 @@ +#include "../include/common.h" +#include "../include/cuda_utils.h" + +namespace kernel_lab { + +__global__ void row_softmax_kernel( + const float* x, + float* out, + int64_t num_rows, + int64_t num_cols) { + int row = blockIdx.x; + if (row >= num_rows) { + return; + } + + // TODO(student): decide whether one block owns one row or one row tile. + // TODO(student): compute the row max for numerical stability. + // TODO(student): compute exp(x - max), reduce the sum, and normalize. 
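+  //
+  // A common plan, assuming one block owns one full row (a hint, not a
+  // requirement):
+  //   1. each thread strides across the row keeping a private max;
+  //   2. reduce the per-thread maxima through shared memory, with
+  //      __syncthreads() separating the write and read phases;
+  //   3. repeat the strided-loop-plus-reduction pattern for sum(exp(x - max));
+  //   4. normalize and store with one final strided loop.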
+  (void)x;
+  (void)out;
+  (void)num_rows;
+  (void)num_cols;
+}
+
+torch::Tensor row_softmax_cuda(torch::Tensor x) {
+  LAB_CHECK_CUDA(x);
+  LAB_CHECK_CONTIGUOUS(x);
+  TORCH_CHECK(x.dim() == 2, "row_softmax_cuda expects a 2D tensor");
+  TORCH_CHECK(x.scalar_type() == torch::kFloat32, "row_softmax_cuda currently assumes float32");
+
+  TORCH_CHECK(
+      false,
+      "TODO(student): implement row_softmax_cuda in kernels/cuda/src/row_softmax.cu.");
+  return torch::Tensor();
+}
+
+} // namespace kernel_lab
diff --git a/kernels/cuda/src/tiled_matmul.cu b/kernels/cuda/src/tiled_matmul.cu
new file mode 100644
index 0000000..2679bb6
--- /dev/null
+++ b/kernels/cuda/src/tiled_matmul.cu
@@ -0,0 +1,40 @@
+#include "../include/common.h"
+#include "../include/cuda_utils.h"
+
+namespace kernel_lab {
+
+__global__ void tiled_matmul_kernel(
+    const float* a,
+    const float* b,
+    float* c,
+    int64_t m,
+    int64_t n,
+    int64_t k) {
+  // TODO(student): map blockIdx/threadIdx to a C tile.
+  // TODO(student): cooperatively load A and B tiles into shared memory.
+  // TODO(student): accumulate partial products across the K dimension.
+  (void)a;
+  (void)b;
+  (void)c;
+  (void)m;
+  (void)n;
+  (void)k;
+}
+
+torch::Tensor tiled_matmul_cuda(torch::Tensor a, torch::Tensor b) {
+  LAB_CHECK_CUDA(a);
+  LAB_CHECK_CUDA(b);
+  LAB_CHECK_CONTIGUOUS(a);
+  LAB_CHECK_CONTIGUOUS(b);
+  TORCH_CHECK(a.dim() == 2 && b.dim() == 2, "tiled_matmul_cuda expects 2D tensors");
+  TORCH_CHECK(a.size(1) == b.size(0), "inner dimensions must match");
+  TORCH_CHECK(a.scalar_type() == torch::kFloat32, "tiled_matmul_cuda currently assumes float32");
+  TORCH_CHECK(b.scalar_type() == torch::kFloat32, "tiled_matmul_cuda currently assumes float32");
+
+  TORCH_CHECK(
+      false,
+      "TODO(student): implement tiled_matmul_cuda in kernels/cuda/src/tiled_matmul.cu.");
+  return torch::Tensor();
+}
+
+} // namespace kernel_lab
diff --git a/kernels/cuda/src/vector_add.cu b/kernels/cuda/src/vector_add.cu
new file mode 100644
index 0000000..99c77a1
--- /dev/null
+++ b/kernels/cuda/src/vector_add.cu
@@ -0,0 +1,35 @@
+#include "../include/common.h"
+#include "../include/cuda_utils.h"
+
+namespace kernel_lab {
+
+__global__ void vector_add_kernel(
+    const float* x,
+    const float* y,
+    float* out,
+    int64_t numel) {
+  int64_t global_idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (global_idx >= numel) {
+    return;
+  }
+
+  (void)x;
+  (void)y;
+  (void)out;
+  (void)numel;
+  // TODO(student): replace this placeholder with the real vector-add math.
+  // Hint: one thread should own one element for the first implementation.
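+  //
+  // A variant worth trying afterwards: the grid-stride loop, where one thread
+  // owns several elements instead of exactly one:
+  //   for (int64_t i = global_idx; i < numel;
+  //        i += static_cast<int64_t>(gridDim.x) * blockDim.x) { ... }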
+} + +torch::Tensor vector_add_cuda(torch::Tensor x, torch::Tensor y) { + check_cuda_pair(x, y); + LAB_CHECK_SAME_SHAPE(x, y); + TORCH_CHECK(x.scalar_type() == torch::kFloat32, "vector_add_cuda currently assumes float32"); + + TORCH_CHECK( + false, + "TODO(student): implement vector_add_cuda in kernels/cuda/src/vector_add.cu and then launch the kernel."); + return torch::Tensor(); +} + +} // namespace kernel_lab diff --git a/kernels/triton/__init__.py b/kernels/triton/__init__.py new file mode 100644 index 0000000..bc218aa --- /dev/null +++ b/kernels/triton/__init__.py @@ -0,0 +1,2 @@ +"""Triton learner skeletons.""" + diff --git a/kernels/triton/flash_attention_fwd.py b/kernels/triton/flash_attention_fwd.py new file mode 100644 index 0000000..30df4a4 --- /dev/null +++ b/kernels/triton/flash_attention_fwd.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +import torch + +try: + import triton + import triton.language as tl +except ImportError: # pragma: no cover - depends on local environment + triton = None + tl = None + + +TRITON_AVAILABLE = triton is not None + + +if TRITON_AVAILABLE: + + @triton.jit + def flash_attention_fwd_kernel( + q_ptr, + k_ptr, + v_ptr, + out_ptr, + seq_len, + head_dim, + stride_q_batch, + stride_q_head, + stride_q_seq, + stride_q_dim, + stride_k_batch, + stride_k_head, + stride_k_seq, + stride_k_dim, + stride_v_batch, + stride_v_head, + stride_v_seq, + stride_v_dim, + stride_out_batch, + stride_out_head, + stride_out_seq, + stride_out_dim, + causal, + block_q: tl.constexpr, + block_k: tl.constexpr, + block_d: tl.constexpr, + ): + pid_q = tl.program_id(axis=0) + pid_bh = tl.program_id(axis=1) + # TODO(student): map pid_q and pid_bh to a batch/head/query tile. + # TODO(student): load Q, K, and V blocks. + # TODO(student): compute scores for the current block pair. + # TODO(student): apply optional causal masking. + # TODO(student): update online softmax state and accumulate the output block. + # TODO(student): store the final output tile. + pass + + +def triton_flash_attention_fwd( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + causal: bool = False, + block_q: int = 64, + block_k: int = 64, +) -> torch.Tensor: + if not TRITON_AVAILABLE: + raise RuntimeError("Triton is not installed in this environment.") + if q.shape != k.shape or q.shape != v.shape: + raise ValueError(f"q, k, v must match; got {q.shape}, {k.shape}, {v.shape}") + if q.ndim != 4: + raise ValueError("expected [batch, heads, seq, dim] inputs") + if not q.is_cuda or not k.is_cuda or not v.is_cuda: + raise ValueError("Triton kernels in this lab expect CUDA tensors.") + raise NotImplementedError("TODO(student): implement the FlashAttention forward launch.") + diff --git a/kernels/triton/online_softmax.py b/kernels/triton/online_softmax.py new file mode 100644 index 0000000..328c818 --- /dev/null +++ b/kernels/triton/online_softmax.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +import torch + +try: + import triton + import triton.language as tl +except ImportError: # pragma: no cover - depends on local environment + triton = None + tl = None + + +TRITON_AVAILABLE = triton is not None + + +if TRITON_AVAILABLE: + + @triton.jit + def online_softmax_kernel( + x_ptr, + out_ptr, + num_cols, + stride_x_row, + stride_out_row, + block_size: tl.constexpr, + ): + row_idx = tl.program_id(axis=0) + # TODO(student): maintain running max and running sum for this row. + # TODO(student): process the row in blocks rather than assuming all columns fit at once. 
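+        # A workable per-block recurrence (a hint; it mirrors the column loop in
+        # reference/torch_online_softmax.py, just over blocks instead of single
+        # columns):
+        #   m_new = tl.maximum(m_old, tl.max(block, axis=0))
+        #   l_new = l_old * tl.exp(m_old - m_new) + tl.sum(tl.exp(block - m_new), axis=0)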
+ # TODO(student): write the final normalized probabilities. + pass + + +def triton_online_softmax(x: torch.Tensor, block_size: int = 128) -> torch.Tensor: + if not TRITON_AVAILABLE: + raise RuntimeError("Triton is not installed in this environment.") + if x.ndim != 2: + raise ValueError(f"expected 2D input, got {tuple(x.shape)}") + if not x.is_cuda: + raise ValueError("Triton kernels in this lab expect CUDA tensors.") + raise NotImplementedError("TODO(student): implement online softmax in Triton.") + diff --git a/kernels/triton/row_softmax.py b/kernels/triton/row_softmax.py new file mode 100644 index 0000000..db8dd27 --- /dev/null +++ b/kernels/triton/row_softmax.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import torch + +try: + import triton + import triton.language as tl +except ImportError: # pragma: no cover - depends on local environment + triton = None + tl = None + + +TRITON_AVAILABLE = triton is not None + + +if TRITON_AVAILABLE: + + @triton.jit + def row_softmax_kernel( + x_ptr, + out_ptr, + num_cols, + stride_x_row, + stride_out_row, + block_size: tl.constexpr, + ): + row_idx = tl.program_id(axis=0) + col_offsets = tl.arange(0, block_size) + # TODO(student): convert row_idx and col_offsets into pointers for this row. + # TODO(student): load a row with masking. + # TODO(student): subtract the row max for stability. + # TODO(student): exponentiate, sum, and normalize. + # TODO(student): store the normalized row. + pass + + +def triton_row_softmax(x: torch.Tensor, block_size: int = 128) -> torch.Tensor: + if not TRITON_AVAILABLE: + raise RuntimeError("Triton is not installed in this environment.") + if x.ndim != 2: + raise ValueError(f"expected 2D input, got {tuple(x.shape)}") + if not x.is_cuda: + raise ValueError("Triton kernels in this lab expect CUDA tensors.") + raise NotImplementedError("TODO(student): implement row-wise softmax launch logic.") + diff --git a/kernels/triton/tiled_matmul.py b/kernels/triton/tiled_matmul.py new file mode 100644 index 0000000..9059458 --- /dev/null +++ b/kernels/triton/tiled_matmul.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +import torch + +try: + import triton + import triton.language as tl +except ImportError: # pragma: no cover - depends on local environment + triton = None + tl = None + + +TRITON_AVAILABLE = triton is not None + + +if TRITON_AVAILABLE: + + @triton.jit + def tiled_matmul_kernel( + a_ptr, + b_ptr, + c_ptr, + m, + n, + k, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + block_m: tl.constexpr, + block_n: tl.constexpr, + block_k: tl.constexpr, + ): + pid_m = tl.program_id(axis=0) + pid_n = tl.program_id(axis=1) + # TODO(student): compute the tile owned by this program instance. + # TODO(student): loop over K tiles and accumulate partial products. + # TODO(student): use masking on edge tiles. + # TODO(student): store the output tile. 
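+        # Shape of the classic K loop (a hint, assuming float32 accumulation):
+        #   acc = tl.zeros((block_m, block_n), dtype=tl.float32)
+        #   for k0 in range(0, k, block_k):
+        #       acc += tl.dot(a_tile, b_tile)  # masked loads on edge tiles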
+ pass + + +def triton_tiled_matmul( + a: torch.Tensor, + b: torch.Tensor, + block_m: int = 64, + block_n: int = 64, + block_k: int = 32, +) -> torch.Tensor: + if not TRITON_AVAILABLE: + raise RuntimeError("Triton is not installed in this environment.") + if a.ndim != 2 or b.ndim != 2: + raise ValueError("expected two 2D tensors") + if a.shape[1] != b.shape[0]: + raise ValueError(f"incompatible shapes: {a.shape} and {b.shape}") + if not a.is_cuda or not b.is_cuda: + raise ValueError("Triton kernels in this lab expect CUDA tensors.") + raise NotImplementedError("TODO(student): implement the tiled Triton matmul path.") + diff --git a/kernels/triton/vector_add.py b/kernels/triton/vector_add.py new file mode 100644 index 0000000..d5a2bfc --- /dev/null +++ b/kernels/triton/vector_add.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +import torch + +try: + import triton + import triton.language as tl +except ImportError: # pragma: no cover - depends on local environment + triton = None + tl = None + + +TRITON_AVAILABLE = triton is not None + + +if TRITON_AVAILABLE: + + @triton.jit + def vector_add_kernel( + x_ptr, + y_ptr, + out_ptr, + num_elements, + block_size: tl.constexpr, + ): + pid = tl.program_id(axis=0) + offsets = pid * block_size + tl.arange(0, block_size) + mask = offsets < num_elements + # TODO(student): load x and y using masked tl.load calls. + # TODO(student): add the vectors. + # TODO(student): write the result with tl.store. + pass + + +def triton_vector_add(x: torch.Tensor, y: torch.Tensor, block_size: int = 1024) -> torch.Tensor: + """Student entrypoint for the Triton vector add task.""" + if not TRITON_AVAILABLE: + raise RuntimeError("Triton is not installed in this environment.") + if x.shape != y.shape: + raise ValueError(f"shape mismatch: {x.shape} vs {y.shape}") + if not x.is_cuda or not y.is_cuda: + raise ValueError("Triton kernels in this lab expect CUDA tensors.") + raise NotImplementedError("TODO(student): launch vector_add_kernel and return the output tensor.") + diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..69a527e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,45 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "kernel-lab" +version = "0.1.0" +description = "Educational GPU kernel lab for PyTorch, Triton, and CUDA." 
+readme = "README.md" +requires-python = ">=3.10" +dependencies = [ + "numpy>=1.26", + "torch>=2.10", + "triton>=3.0", + "pytest>=8.0", + "packaging>=24.0", + "cmake>=3.25", + "ninja>=1.11", +] + +[tool.uv.sources] +torch = { index = "pytorch-cu128" } + +[[tool.uv.index]] +name = "aliyun-pypi" +url = "https://mirrors.aliyun.com/pypi/simple/" +default = true + +[[tool.uv.index]] +name = "pytorch-cu128" +url = "https://download.pytorch.org/whl/cu128" +explicit = true + +[tool.setuptools.packages.find] +include = ["reference*", "kernels*"] + +[tool.pytest.ini_options] +addopts = "-ra --import-mode=importlib" +testpaths = ["tests", "tasks"] +markers = [ + "reference: tests for plain PyTorch references", + "skeleton: tests validating learner skeleton behavior", + "cuda_required: tests that need CUDA", + "triton_required: tests that need Triton", +] diff --git a/reference/__init__.py b/reference/__init__.py new file mode 100644 index 0000000..1626e19 --- /dev/null +++ b/reference/__init__.py @@ -0,0 +1,2 @@ +"""Reference PyTorch implementations used throughout the lab.""" + diff --git a/reference/torch_attention.py b/reference/torch_attention.py new file mode 100644 index 0000000..0ed80a1 --- /dev/null +++ b/reference/torch_attention.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +import math + +import torch + + +def torch_attention( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + causal: bool = False, +) -> torch.Tensor: + """Reference scaled dot-product attention forward pass.""" + if q.ndim != 4 or k.ndim != 4 or v.ndim != 4: + raise ValueError("expected tensors shaped [batch, heads, seq, dim]") + if q.shape != k.shape or q.shape != v.shape: + raise ValueError(f"q, k, v must have matching shapes; got {q.shape}, {k.shape}, {v.shape}") + + dim = q.shape[-1] + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(dim) + if causal: + seq = q.shape[-2] + mask = torch.triu( + torch.ones((seq, seq), dtype=torch.bool, device=q.device), diagonal=1 + ) + scores = scores.masked_fill(mask, float("-inf")) + probs = torch.softmax(scores, dim=-1) + return torch.matmul(probs, v) + diff --git a/reference/torch_matmul.py b/reference/torch_matmul.py new file mode 100644 index 0000000..5242b86 --- /dev/null +++ b/reference/torch_matmul.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +import torch + + +def torch_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + """Reference matrix multiplication with simple shape validation.""" + if a.ndim != 2 or b.ndim != 2: + raise ValueError("torch_matmul expects two 2D tensors") + if a.shape[1] != b.shape[0]: + raise ValueError(f"incompatible shapes: {a.shape} and {b.shape}") + return a @ b + diff --git a/reference/torch_online_softmax.py b/reference/torch_online_softmax.py new file mode 100644 index 0000000..96d8d45 --- /dev/null +++ b/reference/torch_online_softmax.py @@ -0,0 +1,25 @@ +from __future__ import annotations + +import torch + + +def torch_online_softmax(x: torch.Tensor) -> torch.Tensor: + """Reference online-softmax derivation implemented with an explicit loop.""" + if x.ndim != 2: + raise ValueError(f"expected a 2D tensor, got shape {tuple(x.shape)}") + + running_max = torch.full( + (x.shape[0],), float("-inf"), dtype=x.dtype, device=x.device + ) + running_sum = torch.zeros((x.shape[0],), dtype=x.dtype, device=x.device) + + for col in range(x.shape[1]): + current = x[:, col] + new_max = torch.maximum(running_max, current) + old_scale = torch.exp(running_max - new_max) + current_scale = torch.exp(current - new_max) 
+ running_sum = running_sum * old_scale + current_scale + running_max = new_max + + return torch.exp(x - running_max[:, None]) / running_sum[:, None] + diff --git a/reference/torch_row_softmax.py b/reference/torch_row_softmax.py new file mode 100644 index 0000000..ecd183b --- /dev/null +++ b/reference/torch_row_softmax.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +import torch + + +def torch_row_softmax(x: torch.Tensor) -> torch.Tensor: + """Numerically stable row-wise softmax for 2D inputs.""" + if x.ndim != 2: + raise ValueError(f"expected a 2D tensor, got shape {tuple(x.shape)}") + row_max = x.max(dim=1, keepdim=True).values + shifted = x - row_max + exp_shifted = shifted.exp() + row_sum = exp_shifted.sum(dim=1, keepdim=True) + return exp_shifted / row_sum + diff --git a/reference/torch_vector_add.py b/reference/torch_vector_add.py new file mode 100644 index 0000000..0924224 --- /dev/null +++ b/reference/torch_vector_add.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +import torch + + +def torch_vector_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + """Reference vector add with explicit shape checks.""" + if x.shape != y.shape: + raise ValueError(f"shape mismatch: {x.shape} vs {y.shape}") + return x + y + diff --git a/tasks/00_env_sanity/__init__.py b/tasks/00_env_sanity/__init__.py new file mode 100644 index 0000000..247e6fc --- /dev/null +++ b/tasks/00_env_sanity/__init__.py @@ -0,0 +1,2 @@ +"""Environment sanity task.""" + diff --git a/tasks/00_env_sanity/checklist.md b/tasks/00_env_sanity/checklist.md new file mode 100644 index 0000000..fa5a415 --- /dev/null +++ b/tasks/00_env_sanity/checklist.md @@ -0,0 +1,13 @@ +# Environment Checklist + +- PyTorch imports successfully +- `torch.cuda.is_available()` is `True` +- At least one CUDA device is visible +- The GPU name matches the machine you expect to be using +- Device capability is printed and recorded +- Triton imports successfully, or you know why it does not +- `torch.version.cuda` is visible when using CUDA-enabled PyTorch +- `nvcc --version` works if you plan to build the CUDA extension +- `nvidia-smi` works if the driver stack is installed + +If any line above fails, fix that before working on later tasks. diff --git a/tasks/00_env_sanity/spec.md b/tasks/00_env_sanity/spec.md new file mode 100644 index 0000000..9810069 --- /dev/null +++ b/tasks/00_env_sanity/spec.md @@ -0,0 +1,46 @@ +# Task 00: Environment Sanity + +## 1. Problem Statement + +Confirm that your machine can see the GPU software stack needed for the rest of the lab. + +## 2. Expected Input/Output Shapes + +This task is informational rather than tensor-shaped. The outputs are environment facts: + +- PyTorch version +- CUDA availability +- Triton import status +- GPU name +- device capability +- toolkit and driver hints when available + +## 3. Performance Intuition + +Do not benchmark anything yet. First confirm that the environment is what you think it is. + +## 4. Memory Access Discussion + +Not applicable yet. The point is to avoid debugging kernels when the real problem is a mismatched driver or toolkit. + +## 5. What Triton Is Abstracting + +Even importing Triton depends on a compatible Python, PyTorch, driver, and GPU stack. + +## 6. What CUDA Makes Explicit + +CUDA makes the toolkit and architecture targeting explicit. Keep that explicit throughout this repo. + +## 7. Reflection Questions + +- What exact GPU name does the system report? +- What device capability does PyTorch report? +- Does Triton import cleanly? 
+- Which part of the stack would you inspect first if CUDA is unavailable? + +## 8. Implementation Checklist + +- Run `python tools/check_env.py` +- Run `python tools/print_device_info.py` +- Write down the reported capability +- Set `KERNEL_LAB_CUDA_ARCH` explicitly if you need to change architecture targeting diff --git a/tasks/01_vector_add/__init__.py b/tasks/01_vector_add/__init__.py new file mode 100644 index 0000000..9a154c3 --- /dev/null +++ b/tasks/01_vector_add/__init__.py @@ -0,0 +1,2 @@ +"""Vector add task.""" + diff --git a/tasks/01_vector_add/bench.py b/tasks/01_vector_add/bench.py new file mode 100644 index 0000000..6d7f9c8 --- /dev/null +++ b/tasks/01_vector_add/bench.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.vector_add import triton_vector_add +from reference.torch_vector_add import torch_vector_add + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 25) -> float: + for _ in range(warmup): + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + torch.cuda.synchronize() + start = time.perf_counter() + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def main() -> None: + device = "cuda" if torch.cuda.is_available() else "cpu" + x = torch.randn(1 << 20, device=device) + y = torch.randn(1 << 20, device=device) + ref_ms = benchmark(torch_vector_add, x, y) + print(f"torch_vector_add: {ref_ms:.3f} ms") + if device == "cuda": + try: + triton_ms = benchmark(triton_vector_add, x, y) + print(f"triton_vector_add: {triton_ms:.3f} ms") + except (NotImplementedError, RuntimeError) as exc: + print(f"triton_vector_add: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/tasks/01_vector_add/cuda_skeleton.cu b/tasks/01_vector_add/cuda_skeleton.cu new file mode 100644 index 0000000..736f4cf --- /dev/null +++ b/tasks/01_vector_add/cuda_skeleton.cu @@ -0,0 +1,10 @@ +// Workbook-local CUDA sketch for vector add. +// +// The repository-level implementation lives in kernels/cuda/src/vector_add.cu. +// Read this side by side with the Triton version. + +// TODO(student): +// 1. Compute global_idx from blockIdx.x, blockDim.x, and threadIdx.x. +// 2. Guard the tail with if (global_idx < numel). +// 3. Load x[global_idx] and y[global_idx]. +// 4. Store the sum. diff --git a/tasks/01_vector_add/spec.md b/tasks/01_vector_add/spec.md new file mode 100644 index 0000000..4aaa336 --- /dev/null +++ b/tasks/01_vector_add/spec.md @@ -0,0 +1,40 @@ +# Task 01: Vector Add + +## 1. Problem Statement + +Implement `out[i] = x[i] + y[i]` in both Triton and CUDA, then compare both against the PyTorch reference. + +## 2. Expected Input/Output Shapes + +- Input: two tensors with identical 1D or flattened shapes +- Output: one tensor with the same shape + +## 3. Performance Intuition + +Vector add is simple enough that launch overhead and memory bandwidth dominate quickly. It is a good place to learn indexing before the math becomes interesting. + +## 4. Memory Access Discussion + +This kernel should read `x[i]` and `y[i]` once and write `out[i]` once. The main thing to inspect is whether neighboring threads or lanes access neighboring elements. + +## 5. 
What Triton Is Abstracting + +Triton lets you express one block of contiguous offsets with `program_id` and `tl.arange`, then apply a mask on the tail. + +## 6. What CUDA Makes Explicit + +CUDA makes you compute `global_idx` from block and thread indices yourself and write the boundary check explicitly. + +## 7. Reflection Questions + +- What is the exact correspondence between `program_id` and `blockIdx.x` here? +- Why is a mask or bounds check required on the final block? +- How would the ownership change if one thread handled multiple elements? + +## 8. Implementation Checklist + +- Confirm the reference implementation +- Fill in the Triton masked loads, add, and store +- Fill in the CUDA thread ownership and store +- Test small and non-multiple-of-block-size shapes +- Benchmark bandwidth on larger vectors diff --git a/tasks/01_vector_add/test_task.py b/tasks/01_vector_add/test_task.py new file mode 100644 index 0000000..36f1289 --- /dev/null +++ b/tasks/01_vector_add/test_task.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +import pytest +import torch + +from kernels.triton.vector_add import triton_vector_add +from reference.torch_vector_add import torch_vector_add + + +def _run_impl_or_skip(fn, *args): + try: + return fn(*args) + except NotImplementedError: + pytest.skip("implementation is still TODO") + except RuntimeError as exc: + pytest.skip(str(exc)) + + +@pytest.mark.reference +def test_vector_add_reference_matches_torch(): + x = torch.randn(257) + y = torch.randn(257) + out = torch_vector_add(x, y) + torch.testing.assert_close(out, x + y) + + +@pytest.mark.triton_required +@pytest.mark.skeleton +def test_triton_vector_add_if_available(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + x = torch.randn(513, device="cuda") + y = torch.randn(513, device="cuda") + out = _run_impl_or_skip(triton_vector_add, x, y) + torch.testing.assert_close(out, x + y) diff --git a/tasks/01_vector_add/triton_skeleton.py b/tasks/01_vector_add/triton_skeleton.py new file mode 100644 index 0000000..661d0a8 --- /dev/null +++ b/tasks/01_vector_add/triton_skeleton.py @@ -0,0 +1,19 @@ +"""Workbook-local Triton sketch for vector add. + +The repository-level implementation lives in kernels/triton/vector_add.py. +Use this file as a short-form scratchpad before editing the real kernel. +""" + + +def notes() -> str: + return """ +TODO(student): +1. Map one Triton program instance to one contiguous block of elements. +2. Compute offsets with pid * BLOCK_SIZE + arange. +3. Mask the tail. +4. Load x and y, add them, store the result. 
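+
+A minimal sketch of the kernel body, assuming placeholder names such as
+x_ptr, y_ptr, out_ptr, n_elements, and BLOCK_SIZE (the real signature in
+kernels/triton/vector_add.py may differ):
+
+    pid = tl.program_id(axis=0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+    x = tl.load(x_ptr + offsets, mask=mask)
+    y = tl.load(y_ptr + offsets, mask=mask)
+    tl.store(out_ptr + offsets, x + y, mask=mask)
+
+Each program instance owns BLOCK_SIZE contiguous elements, the same
+ownership CUDA spells out as blockIdx.x * blockDim.x + threadIdx.x.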
+""" + + +if __name__ == "__main__": + print(notes()) diff --git a/tasks/02_row_softmax/__init__.py b/tasks/02_row_softmax/__init__.py new file mode 100644 index 0000000..2440342 --- /dev/null +++ b/tasks/02_row_softmax/__init__.py @@ -0,0 +1,2 @@ +"""Row softmax task.""" + diff --git a/tasks/02_row_softmax/bench.py b/tasks/02_row_softmax/bench.py new file mode 100644 index 0000000..ce272e8 --- /dev/null +++ b/tasks/02_row_softmax/bench.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.row_softmax import triton_row_softmax +from reference.torch_row_softmax import torch_row_softmax + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 25) -> float: + for _ in range(warmup): + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + torch.cuda.synchronize() + start = time.perf_counter() + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def main() -> None: + device = "cuda" if torch.cuda.is_available() else "cpu" + x = torch.randn(4096, 1024, device=device) + ref_ms = benchmark(torch_row_softmax, x) + print(f"torch_row_softmax: {ref_ms:.3f} ms") + if device == "cuda": + try: + triton_ms = benchmark(triton_row_softmax, x) + print(f"triton_row_softmax: {triton_ms:.3f} ms") + except (NotImplementedError, RuntimeError) as exc: + print(f"triton_row_softmax: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/tasks/02_row_softmax/cuda_skeleton.cu b/tasks/02_row_softmax/cuda_skeleton.cu new file mode 100644 index 0000000..bc1529a --- /dev/null +++ b/tasks/02_row_softmax/cuda_skeleton.cu @@ -0,0 +1,11 @@ +// Workbook-local CUDA sketch for row softmax. +// +// Reflection prompt: +// Softmax is usually bandwidth-bound because the math is cheap but the rows are read and written a lot. +// Keep track of how many global-memory passes your implementation needs. + +// TODO(student): +// 1. Assign one block or block tile to a row. +// 2. Compute the row max. +// 3. Compute the sum of exp(x - row_max). +// 4. Normalize the row. diff --git a/tasks/02_row_softmax/spec.md b/tasks/02_row_softmax/spec.md new file mode 100644 index 0000000..bfaec7d --- /dev/null +++ b/tasks/02_row_softmax/spec.md @@ -0,0 +1,40 @@ +# Task 02: Row Softmax + +## 1. Problem Statement + +Implement a row-wise softmax with numerical stability and compare naive and fused viewpoints. + +## 2. Expected Input/Output Shapes + +- Input: a 2D tensor `[num_rows, num_cols]` +- Output: a 2D tensor with the same shape + +## 3. Performance Intuition + +Softmax is often bandwidth-bound because each element is read several times unless you fuse work carefully. The arithmetic is cheap relative to the data movement. + +## 4. Memory Access Discussion + +A naive implementation may read rows multiple times: once for the max, once for the sum of exponentials, and once for normalization. Think about which intermediate values can stay on chip. + +## 5. What Triton Is Abstracting + +Triton makes it easy to load a row block, apply masked operations, and reduce across the block with tensor-style code. + +## 6. 
What CUDA Makes Explicit + +CUDA forces you to decide where the row reduction lives: one block per row, multiple warps per row, or a tiled strategy. Shared-memory use and synchronization become explicit design choices. + +## 7. Reflection Questions + +- Why is max subtraction required for stable softmax? +- Why is softmax often bandwidth-bound rather than compute-bound? +- Which intermediate quantities would you prefer not to write back to global memory? + +## 8. Implementation Checklist + +- Validate the reference row softmax +- Fill in Triton row loading, max reduction, sum reduction, and normalization +- Fill in the CUDA reduction structure +- Test large positive and negative values +- Compare against `torch.softmax` diff --git a/tasks/02_row_softmax/test_task.py b/tasks/02_row_softmax/test_task.py new file mode 100644 index 0000000..d369508 --- /dev/null +++ b/tasks/02_row_softmax/test_task.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +import pytest +import torch + +from kernels.triton.row_softmax import triton_row_softmax +from reference.torch_row_softmax import torch_row_softmax + + +def _run_impl_or_skip(fn, *args): + try: + return fn(*args) + except NotImplementedError: + pytest.skip("implementation is still TODO") + except RuntimeError as exc: + pytest.skip(str(exc)) + + +@pytest.mark.reference +def test_row_softmax_reference_matches_torch(): + x = torch.randn(8, 17) + out = torch_row_softmax(x) + torch.testing.assert_close(out, torch.softmax(x, dim=1)) + + +@pytest.mark.reference +def test_row_softmax_reference_is_numerically_stable(): + x = torch.tensor([[1000.0, 1001.0, 1002.0], [-1000.0, -999.0, -998.0]]) + out = torch_row_softmax(x) + torch.testing.assert_close(out.sum(dim=1), torch.ones(2), atol=1e-6, rtol=1e-6) + + +@pytest.mark.triton_required +@pytest.mark.skeleton +def test_triton_row_softmax_if_available(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + x = torch.randn(16, 63, device="cuda") + out = _run_impl_or_skip(triton_row_softmax, x) + torch.testing.assert_close(out, torch.softmax(x, dim=1), atol=1e-4, rtol=1e-4) diff --git a/tasks/02_row_softmax/triton_skeleton.py b/tasks/02_row_softmax/triton_skeleton.py new file mode 100644 index 0000000..37e714a --- /dev/null +++ b/tasks/02_row_softmax/triton_skeleton.py @@ -0,0 +1,20 @@ +"""Workbook-local Triton notes for row softmax.""" + + +def notes() -> str: + return """ +TODO(student): +1. Decide what one program instance owns: a whole row or a row tile. +2. Load a row with masking. +3. Compute row_max = max(x). +4. Compute exp(x - row_max), then the row sum. +5. Normalize and store. + +Reflection: +- Why does numerical stability matter here more than in vector add? +- Where does extra memory traffic appear in a naive multi-kernel approach? 
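+
+A whole-row sketch, assuming one program per row and a power-of-two
+BLOCK_SIZE >= num_cols (x_ptr, out_ptr, and row_stride are placeholders,
+not the repo's actual kernel signature):
+
+    row = tl.program_id(axis=0)
+    cols = tl.arange(0, BLOCK_SIZE)
+    mask = cols < num_cols
+    x = tl.load(x_ptr + row * row_stride + cols, mask=mask, other=float("-inf"))
+    x = x - tl.max(x, axis=0)
+    num = tl.exp(x)
+    tl.store(out_ptr + row * row_stride + cols, num / tl.sum(num, axis=0), mask=mask)
+
+Padding lanes load -inf, so they turn into zeros after exp and never
+pollute the row sum.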
+""" + + +if __name__ == "__main__": + print(notes()) diff --git a/tasks/03_tiled_matmul/__init__.py b/tasks/03_tiled_matmul/__init__.py new file mode 100644 index 0000000..a15c94f --- /dev/null +++ b/tasks/03_tiled_matmul/__init__.py @@ -0,0 +1,2 @@ +"""Tiled matmul task.""" + diff --git a/tasks/03_tiled_matmul/bench.py b/tasks/03_tiled_matmul/bench.py new file mode 100644 index 0000000..184b5bc --- /dev/null +++ b/tasks/03_tiled_matmul/bench.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.tiled_matmul import triton_tiled_matmul +from reference.torch_matmul import torch_matmul + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 20) -> float: + for _ in range(warmup): + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + torch.cuda.synchronize() + start = time.perf_counter() + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def main() -> None: + device = "cuda" if torch.cuda.is_available() else "cpu" + for m, k, n in [(128, 128, 128), (512, 512, 512)]: + a = torch.randn(m, k, device=device) + b = torch.randn(k, n, device=device) + ref_ms = benchmark(torch_matmul, a, b) + print(f"torch_matmul {m}x{k}x{n}: {ref_ms:.3f} ms") + if device == "cuda": + try: + triton_ms = benchmark(triton_tiled_matmul, a, b) + print(f"triton_tiled_matmul {m}x{k}x{n}: {triton_ms:.3f} ms") + except (NotImplementedError, RuntimeError) as exc: + print(f"triton_tiled_matmul {m}x{k}x{n}: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/tasks/03_tiled_matmul/cuda_skeleton.cu b/tasks/03_tiled_matmul/cuda_skeleton.cu new file mode 100644 index 0000000..73da134 --- /dev/null +++ b/tasks/03_tiled_matmul/cuda_skeleton.cu @@ -0,0 +1,9 @@ +// Workbook-local CUDA sketch for tiled matmul. +// +// TODO(student): +// 1. Choose a block tile size, for example 16x16 or 32x32. +// 2. Load one A tile and one B tile into shared memory. +// 3. Synchronize. +// 4. Accumulate partial products. +// 5. Synchronize before loading the next tile. +// 6. Store the final C element or tile. diff --git a/tasks/03_tiled_matmul/spec.md b/tasks/03_tiled_matmul/spec.md new file mode 100644 index 0000000..898339a --- /dev/null +++ b/tasks/03_tiled_matmul/spec.md @@ -0,0 +1,51 @@ +# Task 03: Tiled Matmul + +## 1. Problem Statement + +Implement a tiled matrix multiplication and compare the tile abstraction in Triton with the explicit shared-memory strategy in CUDA. + +## 2. Expected Input/Output Shapes + +- Input `A`: `[M, K]` +- Input `B`: `[K, N]` +- Output `C`: `[M, N]` + +## 3. Performance Intuition + +Matmul becomes interesting once data reuse matters. Re-reading the same `A` and `B` values from global memory is expensive; tiling exists to reuse those values across many multiply-accumulate operations. + +## 4. Memory Access Discussion + +Think about which `A` tile and `B` tile each work unit needs. The performance win comes from moving those tiles into on-chip storage and reusing them before fetching the next tile. + +## 5. What Triton Is Abstracting + +Triton lets you think in output tiles and blocked pointer arithmetic. The tile loads and accumulations read like tensor operations. + +## 6. 
What CUDA Makes Explicit + +CUDA makes you choose block dimensions, allocate shared memory, manage cooperative loads, and synchronize between load and compute phases. + +## 7. Reflection Questions + +- Which values in `A` and `B` are reused across multiple output elements? +- Why does tiling reduce global-memory traffic? +- How does a Triton tile map to CUDA shared-memory tiles and threads? + +## 8. Implementation Checklist + +- Confirm the reference matmul +- Draw a block/tile diagram before coding +- Implement the Triton tile loop over `K` +- Implement the CUDA shared-memory tile loop +- Benchmark against `torch.matmul` on small and medium sizes + +## Tile Diagram Prompt + +Sketch: + +- one output tile `C[m0:m1, n0:n1]` +- the matching `A[m0:m1, k0:k1]` +- the matching `B[k0:k1, n0:n1]` + +That sketch should tell you what belongs in shared memory. diff --git a/tasks/03_tiled_matmul/test_task.py b/tasks/03_tiled_matmul/test_task.py new file mode 100644 index 0000000..a066714 --- /dev/null +++ b/tasks/03_tiled_matmul/test_task.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +import pytest +import torch + +from kernels.triton.tiled_matmul import triton_tiled_matmul +from reference.torch_matmul import torch_matmul + + +def _run_impl_or_skip(fn, *args): + try: + return fn(*args) + except NotImplementedError: + pytest.skip("implementation is still TODO") + except RuntimeError as exc: + pytest.skip(str(exc)) + + +@pytest.mark.reference +def test_tiled_matmul_reference_matches_torch(): + a = torch.randn(8, 16) + b = torch.randn(16, 12) + out = torch_matmul(a, b) + torch.testing.assert_close(out, a @ b) + + +@pytest.mark.triton_required +@pytest.mark.skeleton +def test_triton_tiled_matmul_if_available(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + a = torch.randn(32, 48, device="cuda") + b = torch.randn(48, 40, device="cuda") + out = _run_impl_or_skip(triton_tiled_matmul, a, b) + torch.testing.assert_close(out, a @ b, atol=1e-3, rtol=1e-3) diff --git a/tasks/03_tiled_matmul/triton_skeleton.py b/tasks/03_tiled_matmul/triton_skeleton.py new file mode 100644 index 0000000..da70002 --- /dev/null +++ b/tasks/03_tiled_matmul/triton_skeleton.py @@ -0,0 +1,16 @@ +"""Workbook-local Triton notes for tiled matmul.""" + + +def notes() -> str: + return """ +TODO(student): +1. Map one program instance to one output tile. +2. Build row/col offsets for the tile. +3. Loop over K in block_k chunks. +4. Load A and B tiles, accumulate partial products. +5. Store the output tile with masking on edges. 
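+
+The K loop in shape only (BLOCK_M/BLOCK_N/BLOCK_K, the strides, and
+c_ptrs/c_mask are placeholders; edge masking on the A/B tile loads is
+omitted for brevity):
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for k0 in range(0, K, BLOCK_K):
+        a = tl.load(a_ptrs)  # [BLOCK_M, BLOCK_K] tile of A
+        b = tl.load(b_ptrs)  # [BLOCK_K, BLOCK_N] tile of B
+        acc += tl.dot(a, b)
+        a_ptrs += BLOCK_K * stride_ak
+        b_ptrs += BLOCK_K * stride_bk
+    tl.store(c_ptrs, acc, mask=c_mask)
+
+acc stays in registers for the entire K loop; that residency is the data
+reuse the CUDA version has to buy explicitly with shared-memory tiles.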
+""" + + +if __name__ == "__main__": + print(notes()) diff --git a/tasks/04_online_softmax/__init__.py b/tasks/04_online_softmax/__init__.py new file mode 100644 index 0000000..3e500cf --- /dev/null +++ b/tasks/04_online_softmax/__init__.py @@ -0,0 +1,2 @@ +"""Online softmax task.""" + diff --git a/tasks/04_online_softmax/bench.py b/tasks/04_online_softmax/bench.py new file mode 100644 index 0000000..8cf1057 --- /dev/null +++ b/tasks/04_online_softmax/bench.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.online_softmax import triton_online_softmax +from reference.torch_online_softmax import torch_online_softmax + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 25) -> float: + for _ in range(warmup): + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + torch.cuda.synchronize() + start = time.perf_counter() + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def main() -> None: + device = "cuda" if torch.cuda.is_available() else "cpu" + x = torch.randn(2048, 2048, device=device) + ref_ms = benchmark(torch_online_softmax, x) + print(f"torch_online_softmax: {ref_ms:.3f} ms") + if device == "cuda": + try: + triton_ms = benchmark(triton_online_softmax, x) + print(f"triton_online_softmax: {triton_ms:.3f} ms") + except (NotImplementedError, RuntimeError) as exc: + print(f"triton_online_softmax: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/tasks/04_online_softmax/cuda_skeleton.cu b/tasks/04_online_softmax/cuda_skeleton.cu new file mode 100644 index 0000000..3f8b431 --- /dev/null +++ b/tasks/04_online_softmax/cuda_skeleton.cu @@ -0,0 +1,7 @@ +// Workbook-local CUDA sketch for online softmax. +// +// TODO(student): +// 1. Choose how one block owns one row or row tile. +// 2. Keep running_max and running_sum across column tiles. +// 3. Update the recurrence carefully for numerical stability. +// 4. Normalize the final row. diff --git a/tasks/04_online_softmax/spec.md b/tasks/04_online_softmax/spec.md new file mode 100644 index 0000000..c1ea4a5 --- /dev/null +++ b/tasks/04_online_softmax/spec.md @@ -0,0 +1,49 @@ +# Task 04: Online Softmax + +## 1. Problem Statement + +Implement the running max / running sum formulation of softmax and connect it to blockwise attention. + +## 2. Expected Input/Output Shapes + +- Input: `[num_rows, num_cols]` +- Output: `[num_rows, num_cols]` + +## 3. Performance Intuition + +The main goal is algorithmic structure rather than raw speed. Online softmax becomes powerful because it lets you process a row incrementally without materializing the full reduction context at once. + +## 4. Memory Access Discussion + +Think in column tiles. Each tile updates the running normalization state. This matters later when attention scores are processed block by block. + +## 5. What Triton Is Abstracting + +Triton can express the blocked recurrence with vectorized loads and tensor math while still letting you reason about per-row state. + +## 6. What CUDA Makes Explicit + +CUDA forces you to decide where the running max and running sum live and how threads cooperate to update them across tiles. + +## 7. 
Reflection Questions + +- Why is a running max needed instead of only a running sum? +- Why does online softmax enable FlashAttention-style blockwise computation? +- Which values must persist from one tile to the next? + +## 8. Implementation Checklist + +- Read the reference online softmax +- Derive the recurrence informally +- Implement the Triton blocked recurrence +- Implement the CUDA blocked recurrence +- Compare against full softmax on small shapes first + +## Informal Recurrence + +Given a previous state `(m_prev, l_prev)` and a new tile with max `m_tile` and denominator contribution `l_tile`, define: + +- `m_new = max(m_prev, m_tile)` +- `l_new = l_prev * exp(m_prev - m_new) + l_tile * exp(m_tile - m_new)` + +That is the key idea you will reuse in FlashAttention. diff --git a/tasks/04_online_softmax/test_task.py b/tasks/04_online_softmax/test_task.py new file mode 100644 index 0000000..a6d0899 --- /dev/null +++ b/tasks/04_online_softmax/test_task.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +import pytest +import torch + +from kernels.triton.online_softmax import triton_online_softmax +from reference.torch_online_softmax import torch_online_softmax + + +def _run_impl_or_skip(fn, *args): + try: + return fn(*args) + except NotImplementedError: + pytest.skip("implementation is still TODO") + except RuntimeError as exc: + pytest.skip(str(exc)) + + +@pytest.mark.reference +def test_online_softmax_reference_matches_torch(): + x = torch.randn(6, 19) + out = torch_online_softmax(x) + torch.testing.assert_close(out, torch.softmax(x, dim=1), atol=1e-5, rtol=1e-5) + + +@pytest.mark.triton_required +@pytest.mark.skeleton +def test_triton_online_softmax_if_available(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + x = torch.randn(8, 97, device="cuda") + out = _run_impl_or_skip(triton_online_softmax, x) + torch.testing.assert_close(out, torch.softmax(x, dim=1), atol=1e-4, rtol=1e-4) diff --git a/tasks/04_online_softmax/triton_skeleton.py b/tasks/04_online_softmax/triton_skeleton.py new file mode 100644 index 0000000..03d42cc --- /dev/null +++ b/tasks/04_online_softmax/triton_skeleton.py @@ -0,0 +1,15 @@ +"""Workbook-local Triton notes for online softmax.""" + + +def notes() -> str: + return """ +TODO(student): +1. Keep running_max and running_sum for one row. +2. Process the row in blocks. +3. Update the recurrence after each block. +4. Normalize once the full row has been seen. 
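+
+A blocked restatement of the reference loop in plain PyTorch (BLOCK is a
+placeholder tile width; m and l are exactly the state the Triton kernel
+must carry across column blocks):
+
+    m = torch.full((num_rows,), float("-inf"))
+    l = torch.zeros(num_rows)
+    for x_blk in x.split(BLOCK, dim=1):
+        m_new = torch.maximum(m, x_blk.max(dim=1).values)
+        l = l * torch.exp(m - m_new) + torch.exp(x_blk - m_new[:, None]).sum(dim=1)
+        m = m_new
+    out = torch.exp(x - m[:, None]) / l[:, None]
+
+Only m and l persist from one block to the next; everything else can be
+recomputed per tile.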
+""" + + +if __name__ == "__main__": + print(notes()) diff --git a/tasks/05_flash_attention_fwd/__init__.py b/tasks/05_flash_attention_fwd/__init__.py new file mode 100644 index 0000000..1ccc440 --- /dev/null +++ b/tasks/05_flash_attention_fwd/__init__.py @@ -0,0 +1,2 @@ +"""Flash attention forward task.""" + diff --git a/tasks/05_flash_attention_fwd/bench.py b/tasks/05_flash_attention_fwd/bench.py new file mode 100644 index 0000000..d4a95f5 --- /dev/null +++ b/tasks/05_flash_attention_fwd/bench.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.flash_attention_fwd import triton_flash_attention_fwd +from reference.torch_attention import torch_attention + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 20, **kwargs) -> float: + for _ in range(warmup): + fn(*args, **kwargs) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + torch.cuda.synchronize() + start = time.perf_counter() + fn(*args, **kwargs) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def main() -> None: + device = "cuda" if torch.cuda.is_available() else "cpu" + q = torch.randn(2, 8, 128, 64, device=device) + k = torch.randn(2, 8, 128, 64, device=device) + v = torch.randn(2, 8, 128, 64, device=device) + ref_ms = benchmark(torch_attention, q, k, v, causal=False) + print(f"torch_attention: {ref_ms:.3f} ms") + if device == "cuda": + try: + triton_ms = benchmark(triton_flash_attention_fwd, q, k, v, causal=False) + print(f"triton_flash_attention_fwd: {triton_ms:.3f} ms") + except (NotImplementedError, RuntimeError) as exc: + print(f"triton_flash_attention_fwd: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/tasks/05_flash_attention_fwd/cuda_skeleton.cu b/tasks/05_flash_attention_fwd/cuda_skeleton.cu new file mode 100644 index 0000000..9db6de2 --- /dev/null +++ b/tasks/05_flash_attention_fwd/cuda_skeleton.cu @@ -0,0 +1,14 @@ +// Workbook-local CUDA sketch for FlashAttention forward. +// +// Map this against the Triton sketch: +// - Triton program_id for query tile -> CUDA block ownership +// - Triton block pointer loads -> CUDA cooperative global-to-shared loads +// - Triton masks -> explicit edge and causal checks +// - Triton implicit block math -> thread/block index arithmetic + +// TODO(student): +// 1. Assign a block to one batch/head/query tile. +// 2. Load a Q tile and loop over K/V tiles. +// 3. Compute score tiles and causal masking. +// 4. Update online softmax state. +// 5. Accumulate the output tile. diff --git a/tasks/05_flash_attention_fwd/spec.md b/tasks/05_flash_attention_fwd/spec.md new file mode 100644 index 0000000..b26687e --- /dev/null +++ b/tasks/05_flash_attention_fwd/spec.md @@ -0,0 +1,59 @@ +# Task 05: Flash Attention Forward + +## 1. Problem Statement + +Implement a learning-oriented forward-only FlashAttention-style kernel in both Triton and CUDA. + +## 2. Expected Input/Output Shapes + +- `Q`: `[batch, heads, seq_len, head_dim]` +- `K`: `[batch, heads, seq_len, head_dim]` +- `V`: `[batch, heads, seq_len, head_dim]` +- `Output`: `[batch, heads, seq_len, head_dim]` + +## 3. Performance Intuition + +The goal is to reduce memory traffic by avoiding full materialization of the score matrix. 
Correctness comes first. Performance work only matters after the blockwise algorithm is correct. + +## 4. Memory Access Discussion + +This task is about staged movement: + +- load a `Q` block +- iterate over `K` and `V` blocks +- compute score blocks +- update online normalization +- accumulate the output block + +Track where each quantity lives: global memory, registers, or shared memory. + +## 5. What Triton Is Abstracting + +Triton makes block pointers, program IDs, and masked block operations compact. Those abstractions still correspond to explicit memory ownership decisions. + +## 6. What CUDA Makes Explicit + +CUDA exposes thread-block mapping, shared-memory staging, synchronization, and reduction details directly. This is where the same algorithm becomes visibly lower level. + +## 7. Reflection Questions + +- How does online softmax avoid writing out the full score matrix? +- Which loop corresponds to iterating over key/value blocks? +- Where do causal masking and normalization interact? +- How does a Triton block pointer map to a CUDA shared-memory load phase? + +## 8. Implementation Checklist + +- Confirm the PyTorch reference on tiny shapes +- Trace the online softmax state update +- Implement one Triton blockwise forward path +- Implement one CUDA blockwise forward path +- Test non-causal first, then causal +- Benchmark only after small-shape correctness passes + +## Explicit Triton To CUDA Mapping + +- Triton `program_id(axis=0)` for query tiles maps to CUDA query-tile block ownership +- Triton `program_id(axis=1)` for batch/head maps to a flattened batch-head block index +- Triton block pointer math maps to shared-memory staging and pointer arithmetic +- Triton masked edge handling maps to explicit tail checks and mask branches diff --git a/tasks/05_flash_attention_fwd/test_task.py b/tasks/05_flash_attention_fwd/test_task.py new file mode 100644 index 0000000..3e4d563 --- /dev/null +++ b/tasks/05_flash_attention_fwd/test_task.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +import pytest +import torch + +from kernels.triton.flash_attention_fwd import triton_flash_attention_fwd +from reference.torch_attention import torch_attention + + +def _run_impl_or_skip(fn, *args, **kwargs): + try: + return fn(*args, **kwargs) + except NotImplementedError: + pytest.skip("implementation is still TODO") + except RuntimeError as exc: + pytest.skip(str(exc)) + + +@pytest.mark.reference +def test_attention_reference_small_shape(): + q = torch.randn(1, 2, 8, 16) + k = torch.randn(1, 2, 8, 16) + v = torch.randn(1, 2, 8, 16) + out = torch_attention(q, k, v, causal=False) + expected = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=False) + torch.testing.assert_close(out, expected, atol=1e-5, rtol=1e-5) + + +@pytest.mark.reference +def test_attention_reference_causal_small_shape(): + q = torch.randn(1, 1, 8, 16) + k = torch.randn(1, 1, 8, 16) + v = torch.randn(1, 1, 8, 16) + out = torch_attention(q, k, v, causal=True) + expected = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True) + torch.testing.assert_close(out, expected, atol=1e-5, rtol=1e-5) + + +@pytest.mark.triton_required +@pytest.mark.skeleton +def test_triton_flash_attention_if_available(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + q = torch.randn(1, 2, 16, 32, device="cuda") + k = torch.randn(1, 2, 16, 32, device="cuda") + v = torch.randn(1, 2, 16, 32, device="cuda") + out = _run_impl_or_skip(triton_flash_attention_fwd, q, k, v, causal=False) + 
expected = torch_attention(q, k, v, causal=False) + torch.testing.assert_close(out, expected, atol=2e-3, rtol=2e-3) diff --git a/tasks/05_flash_attention_fwd/triton_skeleton.py b/tasks/05_flash_attention_fwd/triton_skeleton.py new file mode 100644 index 0000000..13b3c7f --- /dev/null +++ b/tasks/05_flash_attention_fwd/triton_skeleton.py @@ -0,0 +1,19 @@ +"""Workbook-local Triton notes for FlashAttention forward.""" + + +def notes() -> str: + return """ +TODO(student): +1. Assign one program instance to one query block for one batch/head. +2. Load a Q block. +3. Iterate over K/V blocks. +4. Compute score blocks. +5. Apply optional causal masking. +6. Update running max and running sum. +7. Accumulate the output block. +8. Store the final output. +""" + + +if __name__ == "__main__": + print(notes()) diff --git a/tasks/06_pytorch_custom_op/__init__.py b/tasks/06_pytorch_custom_op/__init__.py new file mode 100644 index 0000000..42c05ea --- /dev/null +++ b/tasks/06_pytorch_custom_op/__init__.py @@ -0,0 +1,2 @@ +"""PyTorch custom op task.""" + diff --git a/tasks/06_pytorch_custom_op/extension_skeleton.py b/tasks/06_pytorch_custom_op/extension_skeleton.py new file mode 100644 index 0000000..9a4841d --- /dev/null +++ b/tasks/06_pytorch_custom_op/extension_skeleton.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from tools.lab_extension import build_extension + + +def main() -> None: + ext = build_extension(verbose=True) + if ext is None: + return + print("Extension loaded.") + print("Available torch.ops namespace:", hasattr(torch.ops, "kernel_lab")) + if hasattr(torch.ops, "kernel_lab"): + print("Registered ops:", dir(torch.ops.kernel_lab)) + + +if __name__ == "__main__": + main() diff --git a/tasks/06_pytorch_custom_op/opcheck_test.py b/tasks/06_pytorch_custom_op/opcheck_test.py new file mode 100644 index 0000000..b4e66be --- /dev/null +++ b/tasks/06_pytorch_custom_op/opcheck_test.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +import pytest +import torch + +from tools.lab_extension import build_extension + + +@pytest.mark.cuda_required +@pytest.mark.skeleton +def test_vector_add_opcheck_if_available(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + ext = build_extension(verbose=False) + if ext is None or not hasattr(torch.ops, "kernel_lab"): + pytest.skip("extension is unavailable") + if not hasattr(torch.library, "opcheck"): + pytest.skip("torch.library.opcheck is unavailable") + + x = torch.randn(32, device="cuda") + y = torch.randn(32, device="cuda") + try: + torch.ops.kernel_lab.vector_add(x, y) + except Exception as exc: + pytest.skip(f"operator is not implemented yet: {exc}") + + torch.library.opcheck(torch.ops.kernel_lab.vector_add, (x, y)) diff --git a/tasks/06_pytorch_custom_op/spec.md b/tasks/06_pytorch_custom_op/spec.md new file mode 100644 index 0000000..f93414a --- /dev/null +++ b/tasks/06_pytorch_custom_op/spec.md @@ -0,0 +1,45 @@ +# Task 06: PyTorch Custom Op + +## 1. Problem Statement + +Expose a CUDA kernel as a PyTorch operator so Python code can call it and test it like any other operator. + +## 2. Expected Input/Output Shapes + +For the starter binding, use vector add: + +- `x`: `[N]` +- `y`: `[N]` +- output: `[N]` + +The same pattern can later be extended to the other operators. + +## 3. 
Performance Intuition + +The binding layer is not usually where the kernel time goes, but it determines whether you can test, benchmark, and profile the CUDA implementation from Python. + +## 4. Memory Access Discussion + +The binding itself does not optimize memory traffic; it passes tensors and dispatches the kernel. Still, the binding must preserve shape, dtype, device, and contiguity assumptions. + +## 5. What Triton Is Abstracting + +Triton often avoids a separate C++ binding layer because Python can launch the JIT kernel directly. + +## 6. What CUDA Makes Explicit + +CUDA plus PyTorch binding requires you to define function signatures, operator registration, and build integration explicitly. + +## 7. Reflection Questions + +- What assumptions should the binding validate before calling a CUDA kernel? +- Why is operator registration useful for testing and benchmarking? +- What changes once you want autograd support? + +## 8. Implementation Checklist + +- Read `kernels/cuda/binding/binding.cpp` +- Build or load the extension from Python +- Call the operator from `torch.ops.kernel_lab` +- Add correctness checks once the CUDA kernel is implemented +- Try `torch.library.opcheck` if your PyTorch build provides it diff --git a/tasks/07_profiling/__init__.py b/tasks/07_profiling/__init__.py new file mode 100644 index 0000000..83fe414 --- /dev/null +++ b/tasks/07_profiling/__init__.py @@ -0,0 +1 @@ +"""Profiling task.""" diff --git a/tasks/07_profiling/profile_examples.md b/tasks/07_profiling/profile_examples.md new file mode 100644 index 0000000..f46ee79 --- /dev/null +++ b/tasks/07_profiling/profile_examples.md @@ -0,0 +1,23 @@ +# Profiling Examples + +## Nsight Compute + +```bash +./tools/profile_ncu.sh python bench/bench_vector_add.py --device cuda --mode triton +./tools/profile_ncu.sh python bench/bench_softmax.py --device cuda --mode torch +``` + +## Nsight Systems + +```bash +./tools/profile_nsys.sh python bench/bench_matmul.py --device cuda --mode triton +./tools/profile_nsys.sh python bench/bench_attention.py --device cuda --mode torch +``` + +## First Things To Inspect + +- median runtime from the benchmark harness +- whether warmup was excluded +- whether kernels overlap or serialize +- whether memory throughput is near a practical ceiling +- whether a kernel launch is tiny enough that launch overhead matters diff --git a/tasks/07_profiling/spec.md b/tasks/07_profiling/spec.md new file mode 100644 index 0000000..c371ced --- /dev/null +++ b/tasks/07_profiling/spec.md @@ -0,0 +1,40 @@ +# Task 07: Profiling + +## 1. Problem Statement + +Profile one kernel at a time and learn to interpret the first few metrics before tuning anything. + +## 2. Expected Input/Output Shapes + +Use the same shapes as your benchmark harness so measurements stay comparable. + +## 3. Performance Intuition + +Profiling is how you turn guesses into evidence. Use it after correctness is established. + +## 4. Memory Access Discussion + +Profilers can tell you whether the kernel is limited by memory throughput, occupancy, or something else. Interpret those numbers in terms of the operator's access pattern. + +## 5. What Triton Is Abstracting + +Triton hides low-level details in code, but profilers still show the resulting kernels and hardware behavior. + +## 6. What CUDA Makes Explicit + +CUDA kernels expose their launch shapes, synchronization behavior, and memory hierarchy choices more directly, which can make profiler results easier to map back to code. + +## 7. 
Reflection Questions + +- Did you profile a single kernel or an entire script? +- Did you warm up before timing? +- Which metric was the first signal that the kernel was bandwidth-bound or compute-bound? + +## 8. Implementation Checklist + +- Pick one benchmark and one implementation +- Warm up first +- Synchronize before and after timing +- Run `ncu` and inspect a small set of metrics +- Run `nsys` and inspect the timeline +- Write down what you learned before changing the kernel diff --git a/tasks/__init__.py b/tasks/__init__.py new file mode 100644 index 0000000..19fcdc7 --- /dev/null +++ b/tasks/__init__.py @@ -0,0 +1,2 @@ +"""Workbook tasks.""" + diff --git a/tests/test_correctness.py b/tests/test_correctness.py new file mode 100644 index 0000000..9597507 --- /dev/null +++ b/tests/test_correctness.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import math + +import torch + +from reference.torch_attention import torch_attention +from reference.torch_matmul import torch_matmul +from reference.torch_online_softmax import torch_online_softmax +from reference.torch_row_softmax import torch_row_softmax +from reference.torch_vector_add import torch_vector_add + + +def test_vector_add_matches_torch(): + x = torch.randn(257) + y = torch.randn(257) + torch.testing.assert_close(torch_vector_add(x, y), x + y) + + +def test_row_softmax_matches_torch(): + x = torch.randn(32, 65) + torch.testing.assert_close(torch_row_softmax(x), torch.softmax(x, dim=1)) + + +def test_matmul_matches_torch(): + a = torch.randn(16, 24) + b = torch.randn(24, 8) + torch.testing.assert_close(torch_matmul(a, b), a @ b) + + +def test_online_softmax_matches_torch(): + x = torch.randn(12, 33) + torch.testing.assert_close( + torch_online_softmax(x), torch.softmax(x, dim=1), atol=1e-5, rtol=1e-5 + ) + + +def test_attention_matches_manual_formula(): + q = torch.randn(1, 2, 8, 16) + k = torch.randn(1, 2, 8, 16) + v = torch.randn(1, 2, 8, 16) + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(q.shape[-1]) + expected = torch.matmul(torch.softmax(scores, dim=-1), v) + torch.testing.assert_close(torch_attention(q, k, v), expected, atol=1e-5, rtol=1e-5) + diff --git a/tests/test_extension_import.py b/tests/test_extension_import.py new file mode 100644 index 0000000..759aa3a --- /dev/null +++ b/tests/test_extension_import.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +import pytest +import torch + +from tools.lab_extension import build_extension + + +@pytest.mark.cuda_required +@pytest.mark.skeleton +def test_extension_can_build_or_skip(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + ext = build_extension(verbose=False) + if ext is None: + pytest.skip("extension build/load is unavailable in this environment") + assert hasattr(torch.ops, "kernel_lab") + assert hasattr(torch.ops.kernel_lab, "vector_add") diff --git a/tests/test_numerics.py b/tests/test_numerics.py new file mode 100644 index 0000000..dd3911e --- /dev/null +++ b/tests/test_numerics.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +import torch + +from reference.torch_online_softmax import torch_online_softmax +from reference.torch_row_softmax import torch_row_softmax + + +def test_row_softmax_handles_large_values(): + x = torch.tensor([[10000.0, 10001.0, 9999.0]], dtype=torch.float32) + out = torch_row_softmax(x) + torch.testing.assert_close(out.sum(dim=1), torch.ones(1), atol=1e-6, rtol=1e-6) + assert torch.isfinite(out).all() + + +def test_online_softmax_handles_large_negative_values(): + x = 
torch.tensor([[-10000.0, -9998.0, -9999.0]], dtype=torch.float32) + out = torch_online_softmax(x) + torch.testing.assert_close(out.sum(dim=1), torch.ones(1), atol=1e-6, rtol=1e-6) + assert torch.isfinite(out).all() + + +def test_row_and_online_softmax_agree(): + x = torch.randn(10, 40) * 8.0 + torch.testing.assert_close( + torch_row_softmax(x), torch_online_softmax(x), atol=1e-5, rtol=1e-5 + ) + diff --git a/tests/test_shapes.py b/tests/test_shapes.py new file mode 100644 index 0000000..6691882 --- /dev/null +++ b/tests/test_shapes.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +import torch + +from reference.torch_attention import torch_attention +from reference.torch_matmul import torch_matmul +from reference.torch_online_softmax import torch_online_softmax +from reference.torch_row_softmax import torch_row_softmax +from reference.torch_vector_add import torch_vector_add + + +def test_vector_add_shape(): + x = torch.randn(11) + y = torch.randn(11) + assert torch_vector_add(x, y).shape == x.shape + + +def test_row_softmax_shape(): + x = torch.randn(4, 9) + assert torch_row_softmax(x).shape == x.shape + + +def test_matmul_shape(): + a = torch.randn(5, 7) + b = torch.randn(7, 3) + assert torch_matmul(a, b).shape == (5, 3) + + +def test_online_softmax_shape(): + x = torch.randn(3, 13) + assert torch_online_softmax(x).shape == x.shape + + +def test_attention_shape(): + q = torch.randn(2, 4, 8, 16) + k = torch.randn(2, 4, 8, 16) + v = torch.randn(2, 4, 8, 16) + assert torch_attention(q, k, v).shape == q.shape + diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 0000000..eecb0de --- /dev/null +++ b/tools/__init__.py @@ -0,0 +1,2 @@ +"""Helper modules and scripts for the lab.""" + diff --git a/tools/check_env.py b/tools/check_env.py new file mode 100644 index 0000000..b0c0686 --- /dev/null +++ b/tools/check_env.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import platform +import shutil +import subprocess + +import torch + + +def run_command(cmd: list[str]) -> str: + if shutil.which(cmd[0]) is None: + return "not found" + try: + result = subprocess.run(cmd, capture_output=True, text=True, check=False) + text = (result.stdout or result.stderr).strip() + return text or f"command exited with code {result.returncode}" + except Exception as exc: # pragma: no cover - defensive + return f"error: {exc}" + + +def main() -> None: + print("=== System ===") + print("python:", platform.python_version()) + print("platform:", platform.platform()) + + print("\n=== PyTorch ===") + print("torch:", torch.__version__) + print("torch.cuda.is_available():", torch.cuda.is_available()) + print("torch.version.cuda:", torch.version.cuda) + + if torch.cuda.is_available(): + device_count = torch.cuda.device_count() + print("cuda device count:", device_count) + for idx in range(device_count): + name = torch.cuda.get_device_name(idx) + capability = torch.cuda.get_device_capability(idx) + print(f"device {idx}: {name} | capability={capability[0]}.{capability[1]}") + else: + print("no CUDA device visible to PyTorch") + + print("\n=== Triton ===") + try: + import triton # type: ignore + + print("triton:", triton.__version__) + except Exception as exc: + print("triton import failed:", exc) + + print("\n=== Toolkit / Driver Hints ===") + print("nvcc --version:") + print(run_command(["nvcc", "--version"])) + print("\nnvidia-smi:") + print(run_command(["nvidia-smi"])) + + +if __name__ == "__main__": + main() + diff --git a/tools/compare_against_torch.py 
b/tools/compare_against_torch.py new file mode 100644 index 0000000..554fba7 --- /dev/null +++ b/tools/compare_against_torch.py @@ -0,0 +1,138 @@ +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.flash_attention_fwd import triton_flash_attention_fwd +from kernels.triton.online_softmax import triton_online_softmax +from kernels.triton.row_softmax import triton_row_softmax +from kernels.triton.tiled_matmul import triton_tiled_matmul +from kernels.triton.vector_add import triton_vector_add +from reference.torch_attention import torch_attention +from reference.torch_matmul import torch_matmul +from reference.torch_online_softmax import torch_online_softmax +from reference.torch_row_softmax import torch_row_softmax +from reference.torch_vector_add import torch_vector_add +from tools.lab_extension import build_extension + + +def compare_vector_add(device: str) -> None: + x = torch.randn(4097, device=device) + y = torch.randn(4097, device=device) + ref = torch_vector_add(x, y) + print("torch reference ready") + try: + torch.testing.assert_close(triton_vector_add(x, y), ref) + print("triton matches torch") + except Exception as exc: + print(f"triton unavailable: {exc}") + ext = build_extension(verbose=False) if device == "cuda" else None + if ext is not None and hasattr(torch.ops, "kernel_lab"): + try: + torch.testing.assert_close(torch.ops.kernel_lab.vector_add(x, y), ref) + print("cuda op matches torch") + except Exception as exc: + print(f"cuda op unavailable: {exc}") + + +def compare_softmax(device: str, variant: str) -> None: + x = torch.randn(128, 257, device=device) + ref = torch_row_softmax(x) if variant == "row" else torch_online_softmax(x) + print("torch reference ready") + triton_fn = triton_row_softmax if variant == "row" else triton_online_softmax + try: + torch.testing.assert_close(triton_fn(x), ref, atol=1e-4, rtol=1e-4) + print("triton matches torch") + except Exception as exc: + print(f"triton unavailable: {exc}") + ext = build_extension(verbose=False) if device == "cuda" else None + op_name = "row_softmax" if variant == "row" else "online_softmax" + if ext is not None and hasattr(torch.ops, "kernel_lab"): + try: + torch.testing.assert_close( + getattr(torch.ops.kernel_lab, op_name)(x), ref, atol=1e-4, rtol=1e-4 + ) + print("cuda op matches torch") + except Exception as exc: + print(f"cuda op unavailable: {exc}") + + +def compare_matmul(device: str) -> None: + a = torch.randn(64, 96, device=device) + b = torch.randn(96, 48, device=device) + ref = torch_matmul(a, b) + print("torch reference ready") + try: + torch.testing.assert_close(triton_tiled_matmul(a, b), ref, atol=1e-3, rtol=1e-3) + print("triton matches torch") + except Exception as exc: + print(f"triton unavailable: {exc}") + ext = build_extension(verbose=False) if device == "cuda" else None + if ext is not None and hasattr(torch.ops, "kernel_lab"): + try: + torch.testing.assert_close( + torch.ops.kernel_lab.tiled_matmul(a, b), ref, atol=1e-3, rtol=1e-3 + ) + print("cuda op matches torch") + except Exception as exc: + print(f"cuda op unavailable: {exc}") + + +def compare_attention(device: str) -> None: + q = torch.randn(1, 2, 16, 32, device=device) + k = torch.randn(1, 2, 16, 32, device=device) + v = torch.randn(1, 2, 16, 32, device=device) + ref = torch_attention(q, k, v, causal=False) + print("torch reference ready") + try: + 
torch.testing.assert_close( + triton_flash_attention_fwd(q, k, v, causal=False), ref, atol=2e-3, rtol=2e-3 + ) + print("triton matches torch") + except Exception as exc: + print(f"triton unavailable: {exc}") + ext = build_extension(verbose=False) if device == "cuda" else None + if ext is not None and hasattr(torch.ops, "kernel_lab"): + try: + torch.testing.assert_close( + torch.ops.kernel_lab.flash_attention_fwd(q, k, v, False), + ref, + atol=2e-3, + rtol=2e-3, + ) + print("cuda op matches torch") + except Exception as exc: + print(f"cuda op unavailable: {exc}") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "--task", + choices=["vector_add", "row_softmax", "online_softmax", "matmul", "attention"], + required=True, + ) + parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu") + args = parser.parse_args() + + if args.task == "vector_add": + compare_vector_add(args.device) + elif args.task == "row_softmax": + compare_softmax(args.device, "row") + elif args.task == "online_softmax": + compare_softmax(args.device, "online") + elif args.task == "matmul": + compare_matmul(args.device) + else: + compare_attention(args.device) + + +if __name__ == "__main__": + main() diff --git a/tools/lab_extension.py b/tools/lab_extension.py new file mode 100644 index 0000000..09f6518 --- /dev/null +++ b/tools/lab_extension.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +import os +from pathlib import Path +from typing import Optional + +import torch + +try: + from torch.utils.cpp_extension import load +except ImportError: # pragma: no cover - depends on torch install + load = None + + +ROOT = Path(__file__).resolve().parents[1] +CUDA_DIR = ROOT / "kernels" / "cuda" + + +def _format_torch_cuda_arch(raw_arch: str) -> str: + if raw_arch.isdigit() and len(raw_arch) == 3: + return f"{raw_arch[:2]}.{raw_arch[2]}" + return raw_arch + + +def build_extension(verbose: bool = True) -> Optional[object]: + """Build or load the lab extension if the local environment allows it.""" + if load is None: + print("torch.utils.cpp_extension.load is unavailable in this PyTorch build.") + return None + if not torch.cuda.is_available(): + print("CUDA is not available; skipping extension build.") + return None + + arch = _format_torch_cuda_arch(os.environ.get("KERNEL_LAB_CUDA_ARCH", "120")) + os.environ.setdefault("TORCH_CUDA_ARCH_LIST", arch) + + sources = [ + str(CUDA_DIR / "binding" / "binding.cpp"), + str(CUDA_DIR / "src" / "vector_add.cu"), + str(CUDA_DIR / "src" / "row_softmax.cu"), + str(CUDA_DIR / "src" / "tiled_matmul.cu"), + str(CUDA_DIR / "src" / "online_softmax.cu"), + str(CUDA_DIR / "src" / "flash_attention_fwd.cu"), + ] + + try: + return load( + name="kernel_lab_ext", + sources=sources, + extra_include_paths=[str(CUDA_DIR / "include")], + extra_cflags=["-O0", "-std=c++17"], + extra_cuda_cflags=["-O0", "-lineinfo"], + verbose=verbose, + ) + except Exception as exc: # pragma: no cover - environment-dependent + print(f"Extension build/load failed: {exc}") + return None diff --git a/tools/print_device_info.py b/tools/print_device_info.py new file mode 100644 index 0000000..d6fc737 --- /dev/null +++ b/tools/print_device_info.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +import torch + + +def main() -> None: + if not torch.cuda.is_available(): + print("CUDA is not available.") + return + + for idx in range(torch.cuda.device_count()): + props = torch.cuda.get_device_properties(idx) + print(f"device {idx}: {props.name}") + print(f" 
capability: {props.major}.{props.minor}") + print(f" total memory (GB): {props.total_memory / 1e9:.2f}") + print(f" multiprocessors: {props.multi_processor_count}") + print(f" max threads per block: {props.max_threads_per_block}") + print(f" warp size: {props.warp_size}") + + +if __name__ == "__main__": + main() + diff --git a/tools/profile_ncu.sh b/tools/profile_ncu.sh new file mode 100755 index 0000000..2e00e0f --- /dev/null +++ b/tools/profile_ncu.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [[ $# -eq 0 ]]; then + echo "usage: $0 <command> [args...]" + exit 1 +fi + +ncu --set full --target-processes all "$@" + diff --git a/tools/profile_nsys.sh b/tools/profile_nsys.sh new file mode 100755 index 0000000..ac45853 --- /dev/null +++ b/tools/profile_nsys.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [[ $# -eq 0 ]]; then + echo "usage: $0 <command> [args...]" + exit 1 +fi + +mkdir -p profile-output +nsys profile --trace=cuda,nvtx,osrt --sample=none -o profile-output/profile "$@" + diff --git a/tools/run_all_benchmarks.sh b/tools/run_all_benchmarks.sh new file mode 100755 index 0000000..957bd38 --- /dev/null +++ b/tools/run_all_benchmarks.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +set -euo pipefail + +python bench/bench_vector_add.py "$@" +python bench/bench_softmax.py "$@" +python bench/bench_matmul.py "$@" +python bench/bench_attention.py "$@" + diff --git a/tools/run_all_tests.sh b/tools/run_all_tests.sh new file mode 100755 index 0000000..7a84c11 --- /dev/null +++ b/tools/run_all_tests.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +set -euo pipefail + +pytest -q + diff --git a/uv.lock b/uv.lock new file mode 100644 index 0000000..82d736f --- /dev/null +++ b/uv.lock @@ -0,0 +1,832 @@ +version = 1 +revision = 3 +requires-python = ">=3.10" +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version < '3.11'", +] + +[[package]] +name = "cmake" +version = "4.3.1" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/40/72/c295e190193af5f41d583517db1ca1cf43eaa2af8140856dca114fa6486d/cmake-4.3.1.tar.gz", hash = "sha256:6fe523413cdd2568a19a6ec297b8f869a95a3f8edaf0dd73731b81412216e00e" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/43/07/05be57c389f8f0c3d0d7b878685ec1eed863b77787d65584c9399e294531/cmake-4.3.1-py3-none-macosx_10_10_universal2.whl", hash = "sha256:976337df534f4eea6b100a7af39f9a7a538aa5fd65b7d770cf2a07907439dca8" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9b/80/09c381519855d53a5cce0e5e10e184f9e89caf6a6f1f7d7b42c17bd68d2e/cmake-4.3.1-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bb7b7ef74beb69c099c39d7f151cfc94256bba1b75354e48ea87d6bf0dcb3007" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d6/0d/eab407c3592442711584d09bef5de17df93f39ea69baaa310c4564436177/cmake-4.3.1-py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:8874aac9fbd07d71c506fcaf57255ff2cc015a15ea44146c0d1e694843d5e312" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ed/de/c7f487b21b33918c0af1dabfdf8d858799e01d62c2bd139fc871b86b21a2/cmake-4.3.1-py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f439501f3f3ecf1dbafbee6226fc6cc680203202ddfe59586357d076c417ae8f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c4/9c/2de3a3b5a5983b72c3e2eeaa23a6c8d251ebae79d15cedb9818e708a4caf/cmake-4.3.1-py3-none-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = 
"sha256:345f5e4ee783cd7691c6b54a631b43bb0c2efabf45afa64ac9000f9b0885d250" }, + { url = "https://mirrors.aliyun.com/pypi/packages/87/32/542ed40b3393bced9af073402f75fb6ae3e57d6656cc38a9470942de7b8c/cmake-4.3.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:80fc99c1958dbd926f529448dfcdbb1b176ecf31a8d485a3d68bd469487e8933" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7f/ec/4f99b7414984d668aaa9c6214df84af689db8756f1536ea81720bb2fef91/cmake-4.3.1-py3-none-manylinux_2_31_armv7l.whl", hash = "sha256:fdb744921ff4739e755faee005b36b97223d2bb8591ebd7b57abb5cf97300925" }, + { url = "https://mirrors.aliyun.com/pypi/packages/bf/b4/627e18c8acf6219b1c8c521e7d702bca36edab21994992b64e68e1007430/cmake-4.3.1-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:fdc39eb421177bfa946af3600c797612ee76bfe6daa6036ff8958c504a99937b" }, + { url = "https://mirrors.aliyun.com/pypi/packages/cc/e1/e3b3dd1c81e72329f1ec3350a02154f74547eab2c1db8270eb2bc344edc1/cmake-4.3.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:8d1d224f9df9e82f154ad31b9798b7b4c0d509a11ccdca695d0ee4d140c30c6d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/98/3e/7a1ca1992de268fe1284b3738549ea8859f665dec294d584ae9200d66ce6/cmake-4.3.1-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:056483831febe0934f25959bc74da077b18f23c7a064a0417432447ec27b8fb2" }, + { url = "https://mirrors.aliyun.com/pypi/packages/16/ce/9b0e79835b674d2680fae7e57d3229abac0765c3d80ddcc6c5c67ec78fd8/cmake-4.3.1-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9567aac26ee7a0594e6b71a8f94e907c7ce957cdeedeca1404504228f2c9885f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/bf/3a/eb15faf1ce961431017cda585650bd37a259fe53e836eae34263ea3647ab/cmake-4.3.1-py3-none-musllinux_1_2_ppc64le.whl", hash = "sha256:8cadd35606d3e4f9a43173f7236cd8947fb9652fdfb272a916f0600ad169fbf2" }, + { url = "https://mirrors.aliyun.com/pypi/packages/48/34/b76e2c7aa0c1aa833430a2b4a1caff3f4163b9db7c38d208a4b6e0287d54/cmake-4.3.1-py3-none-musllinux_1_2_riscv64.whl", hash = "sha256:75622c1e7266e60fab9d4f1f4b4dc25f22a4b57e902d9792c39538f46a997269" }, + { url = "https://mirrors.aliyun.com/pypi/packages/3a/3b/afd7d51aabec951e8881812d9bf1d4c74c1882434e97eb9ed6f097591dd1/cmake-4.3.1-py3-none-musllinux_1_2_s390x.whl", hash = "sha256:e43acaa51bb8fe57a914424edb0efa91eb82d577fb74ecc6ff67da47a1d23524" }, + { url = "https://mirrors.aliyun.com/pypi/packages/4f/be/6d9c4f0ef5383622c3d7fd508acb531b1cbaee530e7cf4196c415f548131/cmake-4.3.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:e8bc1c42517ed9cb26eb293e720449830940ef7be6dcc101638b9cc65ece98c8" }, + { url = "https://mirrors.aliyun.com/pypi/packages/3a/49/ca3671c1b6859b5efe5b9f1bebf95cad823d48a3a8f366a72e207a1d7a02/cmake-4.3.1-py3-none-win32.whl", hash = "sha256:cd9058d730da5fa68394c41b26036b18850de494d730a0a85cde51558138b70b" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2a/2a/a8db7d73a6941e9d4a177011137378b222cd6dcec383f1998f3594c73a0c/cmake-4.3.1-py3-none-win_amd64.whl", hash = "sha256:73fb3851fe760b0395983c5d3dd6da2364b1ce324f8546aee2078d162d96005a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9b/be/d62156e55b8a9614f29cd8e576e9bf925ee2b428e43bb0c4f02b5cb97c65/cmake-4.3.1-py3-none-win_arm64.whl", hash = "sha256:86e97fed7c9a61638b08937981fdc9bca9caec9df9c88b87aa0a47442583e02a" }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = 
"https://mirrors.aliyun.com/pypi/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6" }, +] + +[[package]] +name = "cuda-bindings" +version = "12.9.6" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "cuda-pathfinder" }, +] +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/08/9d/dd87e1071bcb2e438c14e2e4497aa0037faf2c9775ac1d172f578f448668/cuda_bindings-12.9.6-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bb2f1eedc8f65902b34e807c21a3b7c922dc8de1f51d0829ecbb5c6a5e9c5ff1" }, + { url = "https://mirrors.aliyun.com/pypi/packages/8c/1d/5631df2faa5e5f6bd3e8fef098d6fc1b7c6f38811821332ef28ad82ce0d4/cuda_bindings-12.9.6-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d9f9031e7a265e74f1517668139987253552d1677d995da4b0d990aa19b9b9b0" }, + { url = "https://mirrors.aliyun.com/pypi/packages/1f/a5/e9d37c10f6c27c9c65d53c6cd6d9763e1df99c004780585fc2ad9041fbe3/cuda_bindings-12.9.6-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2662f59db67d9aeaf8959c593c91f600792c2970cf02cae2814387fc687b115a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/66/d5/bd4c03e9516d3cf788a270debe28d687e5c48b13a9931599bbddf01de302/cuda_bindings-12.9.6-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8519707644ea630a365b101703a9136f4cb144760cc2c73281c38a05e07d08d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/50/04/8a4d45dc154a8a32982658cc55be291e9778d1197834b15d33427e2f65c1/cuda_bindings-12.9.6-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ea331bc47d9988cc61f0ecc5fa8df9dd188b4493ae1c6688bb1ee8ce8ba1af4" }, + { url = "https://mirrors.aliyun.com/pypi/packages/3b/69/4b0375e1b120dfa7427c31c8420cfdee596ecd03955fd291a96116fa375d/cuda_bindings-12.9.6-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b2b54b95a47104eff56b5155818ab5790e3ccdba8dd51e2928ae56782aaf5b02" }, + { url = "https://mirrors.aliyun.com/pypi/packages/dd/ad/2d9b80c28deae971ce4bbe991c23b81347a2a8918b2672020d07f070a596/cuda_bindings-12.9.6-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da30d89db8188b9beb5a6467d72b2f11d1b667ab901d2d373bcde51b97765b21" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b2/ca/729781d11445cfbacd1af1bf0edfe147c311212cfdf1d5c292e0565fabef/cuda_bindings-12.9.6-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3d1be8bd80b34f51dcbaf138dafd817e888cf2d12c47833019fd933beb32d7ef" }, + { url = "https://mirrors.aliyun.com/pypi/packages/fe/f3/51768221aade33e711dcf7e4a52fdc0d0446c1baf39f6bcc9d69cfbceb0b/cuda_bindings-12.9.6-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:48666e666f083a4c4387ffe20594b05e092b535a4453d1e4817d71237d02aa13" }, + { url = "https://mirrors.aliyun.com/pypi/packages/71/34/14afff4aabe3b5bd84c647dea4a4dfb917c94b8a8df0adb6b1622c2b465b/cuda_bindings-12.9.6-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b4f82f8f8061f3a39446bf854c4edd9bcc2d0da3f58d8f6f54541b3e4d5c933d" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/3d/d3/a29faf4fb371c2f43ffda23a938ec0bebf6dbab676350e137ae0f61e5ec0/cuda_bindings-12.9.6-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f00290f9468d2cfeee92aaad2275be32dfd2f4967a97ac0f12314b7e6281ad78" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2a/97/71e66b2ed65d80f7b70a1538af72d73cd798e22bc93d240d7e69f2366322/cuda_bindings-12.9.6-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d3bc6e28cf5d133f72050c515db72876870fb009f1431bcbf45b54a179be2284" }, + { url = "https://mirrors.aliyun.com/pypi/packages/49/91/c10b575a001aad39c036efd649869aac8d97ef0ba9f1d8ad17b4946b3366/cuda_bindings-12.9.6-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e88d38fdf07cc777dec1afaba8139c2eedb3819063f6b42f1e2ea8516bdd6806" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2a/9a/998471e76bea78e96d3d7fdf0bc5f46c3210858e81e6d13d8186a9dbb636/cuda_bindings-12.9.6-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4df01e34cefd3275170b2ac0426d325271ab435e85f59a69300eacd8ff23d34c" }, +] + +[[package]] +name = "cuda-pathfinder" +version = "1.5.2" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/f2/f9/1b9b60a30fc463c14cdea7a77228131a0ccc89572e8df9cb86c9648271ab/cuda_pathfinder-1.5.2-py3-none-any.whl", hash = "sha256:0c5f160a7756c5b072723cbbd6d861e38917ef956c68150b02f0b6e9271c71fa" }, +] + +[[package]] +name = "cuda-toolkit" +version = "12.8.1" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/d4/c8/7dce3a0b15b42a3b58e7d96eb22a687d3bf2c44e01d149a6874629cd9938/cuda_toolkit-12.8.1-py2.py3-none-any.whl", hash = "sha256:adc7906af4ecbf9a352f9dca5734eceb21daec281ccfcf5675e1d2f724fc2cba" }, +] + +[package.optional-dependencies] +cublas = [ + { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +cudart = [ + { name = "nvidia-cuda-runtime-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +cufft = [ + { name = "nvidia-cufft-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +cufile = [ + { name = "nvidia-cufile-cu12", marker = "sys_platform == 'linux'" }, +] +cupti = [ + { name = "nvidia-cuda-cupti-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +curand = [ + { name = "nvidia-curand-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +cusolver = [ + { name = "nvidia-cusolver-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +cusparse = [ + { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +nvjitlink = [ + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +nvrtc = [ + { name = "nvidia-cuda-nvrtc-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +nvtx = [ + { name = "nvidia-nvtx-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] + +[[package]] +name = "exceptiongroup" +version = "1.3.1" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = 
"https://mirrors.aliyun.com/pypi/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/8a/0e/97c33bf5009bdbac74fd2beace167cab3f978feb69cc36f1ef79360d6c4e/exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598" }, +] + +[[package]] +name = "filelock" +version = "3.25.2" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/94/b8/00651a0f559862f3bb7d6f7477b192afe3f583cc5e26403b44e59a55ab34/filelock-3.25.2.tar.gz", hash = "sha256:b64ece2b38f4ca29dd3e810287aa8c48182bbecd1ae6e9ae126c9b35f1382694" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/a4/a5/842ae8f0c08b61d6484b52f99a03510a3a72d23141942d216ebe81fefbce/filelock-3.25.2-py3-none-any.whl", hash = "sha256:ca8afb0da15f229774c9ad1b455ed96e85a81373065fb10446672f64444ddf70" }, +] + +[[package]] +name = "fsspec" +version = "2026.3.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/e1/cf/b50ddf667c15276a9ab15a70ef5f257564de271957933ffea49d2cdbcdfb/fsspec-2026.3.0.tar.gz", hash = "sha256:1ee6a0e28677557f8c2f994e3eea77db6392b4de9cd1f5d7a9e87a0ae9d01b41" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/d5/1f/5f4a3cd9e4440e9d9bc78ad0a91a1c8d46b4d429d5239ebe6793c9fe5c41/fsspec-2026.3.0-py3-none-any.whl", hash = "sha256:d2ceafaad1b3457968ed14efa28798162f1638dbb5d2a6868a2db002a5ee39a4" }, +] + +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12" }, +] + +[[package]] +name = "jinja2" +version = "3.1.6" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67" }, +] + +[[package]] +name = "kernel-lab" +version = "0.1.0" +source = { editable = "." 
} +dependencies = [ + { name = "cmake" }, + { name = "ninja" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://mirrors.aliyun.com/pypi/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.4", source = { registry = "https://mirrors.aliyun.com/pypi/simple/" }, marker = "python_full_version >= '3.11'" }, + { name = "packaging" }, + { name = "pytest" }, + { name = "torch" }, + { name = "triton" }, +] + +[package.metadata] +requires-dist = [ + { name = "cmake", specifier = ">=3.25" }, + { name = "ninja", specifier = ">=1.11" }, + { name = "numpy", specifier = ">=1.26" }, + { name = "packaging", specifier = ">=24.0" }, + { name = "pytest", specifier = ">=8.0" }, + { name = "torch", specifier = ">=2.10", index = "https://download.pytorch.org/whl/cu128" }, + { name = "triton", specifier = ">=3.0" }, +] + +[[package]] +name = "markupsafe" +version = "3.0.3" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/e8/4b/3541d44f3937ba468b75da9eebcae497dcf67adb65caa16760b0a6807ebb/markupsafe-3.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2f981d352f04553a7171b8e44369f2af4055f888dfb147d55e42d29e29e74559" }, + { url = "https://mirrors.aliyun.com/pypi/packages/98/1b/fbd8eed11021cabd9226c37342fa6ca4e8a98d8188a8d9b66740494960e4/markupsafe-3.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e1c1493fb6e50ab01d20a22826e57520f1284df32f2d8601fdd90b6304601419" }, + { url = "https://mirrors.aliyun.com/pypi/packages/40/01/e560d658dc0bb8ab762670ece35281dec7b6c1b33f5fbc09ebb57a185519/markupsafe-3.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1ba88449deb3de88bd40044603fafffb7bc2b055d626a330323a9ed736661695" }, + { url = "https://mirrors.aliyun.com/pypi/packages/af/cd/ce6e848bbf2c32314c9b237839119c5a564a59725b53157c856e90937b7a/markupsafe-3.0.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f42d0984e947b8adf7dd6dde396e720934d12c506ce84eea8476409563607591" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c9/2a/b5c12c809f1c3045c4d580b035a743d12fcde53cf685dbc44660826308da/markupsafe-3.0.3-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c0c0b3ade1c0b13b936d7970b1d37a57acde9199dc2aecc4c336773e1d86049c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/cf/e3/9427a68c82728d0a88c50f890d0fc072a1484de2f3ac1ad0bfc1a7214fd5/markupsafe-3.0.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0303439a41979d9e74d18ff5e2dd8c43ed6c6001fd40e5bf2e43f7bd9bbc523f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/bc/36/23578f29e9e582a4d0278e009b38081dbe363c5e7165113fad546918a232/markupsafe-3.0.3-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:d2ee202e79d8ed691ceebae8e0486bd9a2cd4794cec4824e1c99b6f5009502f6" }, + { url = "https://mirrors.aliyun.com/pypi/packages/56/21/dca11354e756ebd03e036bd8ad58d6d7168c80ce1fe5e75218e4945cbab7/markupsafe-3.0.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:177b5253b2834fe3678cb4a5f0059808258584c559193998be2601324fdeafb1" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/87/99/faba9369a7ad6e4d10b6a5fbf71fa2a188fe4a593b15f0963b73859a1bbd/markupsafe-3.0.3-cp310-cp310-win32.whl", hash = "sha256:2a15a08b17dd94c53a1da0438822d70ebcd13f8c3a95abe3a9ef9f11a94830aa" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d6/25/55dc3ab959917602c96985cb1253efaa4ff42f71194bddeb61eb7278b8be/markupsafe-3.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:c4ffb7ebf07cfe8931028e3e4c85f0357459a3f9f9490886198848f4fa002ec8" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d0/9e/0a02226640c255d1da0b8d12e24ac2aa6734da68bff14c05dd53b94a0fc3/markupsafe-3.0.3-cp310-cp310-win_arm64.whl", hash = "sha256:e2103a929dfa2fcaf9bb4e7c091983a49c9ac3b19c9061b6d5427dd7d14d81a1" }, + { url = "https://mirrors.aliyun.com/pypi/packages/08/db/fefacb2136439fc8dd20e797950e749aa1f4997ed584c62cfb8ef7c2be0e/markupsafe-3.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cc7ea17a6824959616c525620e387f6dd30fec8cb44f649e31712db02123dad" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e1/2e/5898933336b61975ce9dc04decbc0a7f2fee78c30353c5efba7f2d6ff27a/markupsafe-3.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bd4cd07944443f5a265608cc6aab442e4f74dff8088b0dfc8238647b8f6ae9a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/1d/09/adf2df3699d87d1d8184038df46a9c80d78c0148492323f4693df54e17bb/markupsafe-3.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50" }, + { url = "https://mirrors.aliyun.com/pypi/packages/30/ac/0273f6fcb5f42e314c6d8cd99effae6a5354604d461b8d392b5ec9530a54/markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf" }, + { url = "https://mirrors.aliyun.com/pypi/packages/19/ae/31c1be199ef767124c042c6c3e904da327a2f7f0cd63a0337e1eca2967a8/markupsafe-3.0.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b2/76/7edcab99d5349a4532a459e1fe64f0b0467a3365056ae550d3bcf3f79e1e/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/a4/28/6e74cdd26d7514849143d69f0bf2399f929c37dc2b31e6829fd2045b2765/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115" }, + { url = "https://mirrors.aliyun.com/pypi/packages/62/7e/a145f36a5c2945673e590850a6f8014318d5577ed7e5920a4b3448e0865d/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/0f/62/d9c46a7f5c9adbeeeda52f5b8d802e1094e9717705a645efc71b0913a0a8/markupsafe-3.0.3-cp311-cp311-win32.whl", hash = "sha256:0db14f5dafddbb6d9208827849fad01f1a2609380add406671a26386cdf15a19" }, + { url = "https://mirrors.aliyun.com/pypi/packages/83/8a/4414c03d3f891739326e1783338e48fb49781cc915b2e0ee052aa490d586/markupsafe-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:de8a88e63464af587c950061a5e6a67d3632e36df62b986892331d4620a35c01" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/35/73/893072b42e6862f319b5207adc9ae06070f095b358655f077f69a35601f0/markupsafe-3.0.3-cp311-cp311-win_arm64.whl", hash = "sha256:3b562dd9e9ea93f13d53989d23a7e775fdfd1066c33494ff43f5418bc8c58a5c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce" }, + { url = "https://mirrors.aliyun.com/pypi/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b" }, + { url = "https://mirrors.aliyun.com/pypi/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2f/e1/78ee7a023dac597a5825441ebd17170785a9dab23de95d2c7508ade94e0e/markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/aa/5b/bec5aa9bbbb2c946ca2733ef9c4ca91c91b6a24580193e891b5f7dbe8e1e/markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/38/2f/907b9c7bbba283e68f20259574b13d005c121a0fa4c175f9bed27c4597ff/markupsafe-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e1cf1972137e83c5d4c136c43ced9ac51d0e124706ee1c8aa8532c1287fa8795" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/9c/d9/5f7756922cdd676869eca1c4e3c0cd0df60ed30199ffd775e319089cb3ed/markupsafe-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:116bb52f642a37c115f517494ea5feb03889e04df47eeff5b130b1808ce7c219" }, + { url = "https://mirrors.aliyun.com/pypi/packages/00/07/575a68c754943058c78f30db02ee03a64b3c638586fba6a6dd56830b30a3/markupsafe-3.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:133a43e73a802c5562be9bbcd03d090aa5a1fe899db609c29e8c8d815c5f6de6" }, + { url = "https://mirrors.aliyun.com/pypi/packages/a9/21/9b05698b46f218fc0e118e1f8168395c65c8a2c750ae2bab54fc4bd4e0e8/markupsafe-3.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfcd093f13f0f0b7fdd0f198b90053bf7b2f02a3927a30e63f3ccc9df56b676" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7f/71/544260864f893f18b6827315b988c146b559391e6e7e8f7252839b1b846a/markupsafe-3.0.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:509fa21c6deb7a7a273d629cf5ec029bc209d1a51178615ddf718f5918992ab9" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c2/28/b50fc2f74d1ad761af2f5dcce7492648b983d00a65b8c0e0cb457c82ebbe/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4afe79fb3de0b7097d81da19090f4df4f8d3a2b3adaa8764138aac2e44f3af1" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ed/76/104b2aa106a208da8b17a2fb72e033a5a9d7073c68f7e508b94916ed47a9/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:795e7751525cae078558e679d646ae45574b47ed6e7771863fcc079a6171a0fc" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b5/99/16a5eb2d140087ebd97180d95249b00a03aa87e29cc224056274f2e45fd6/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8485f406a96febb5140bfeca44a73e3ce5116b2501ac54fe953e488fb1d03b12" }, + { url = "https://mirrors.aliyun.com/pypi/packages/19/bc/e7140ed90c5d61d77cea142eed9f9c303f4c4806f60a1044c13e3f1471d0/markupsafe-3.0.3-cp313-cp313-win32.whl", hash = "sha256:bdd37121970bfd8be76c5fb069c7751683bdf373db1ed6c010162b2a130248ed" }, + { url = "https://mirrors.aliyun.com/pypi/packages/05/73/c4abe620b841b6b791f2edc248f556900667a5a1cf023a6646967ae98335/markupsafe-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:9a1abfdc021a164803f4d485104931fb8f8c1efd55bc6b748d2f5774e78b62c5" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f0/3a/fa34a0f7cfef23cf9500d68cb7c32dd64ffd58a12b09225fb03dd37d5b80/markupsafe-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:7e68f88e5b8799aa49c85cd116c932a1ac15caaa3f5db09087854d218359e485" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e4/d7/e05cd7efe43a88a17a37b3ae96e79a19e846f3f456fe79c57ca61356ef01/markupsafe-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:218551f6df4868a8d527e3062d0fb968682fe92054e89978594c28e642c43a73" }, + { url = "https://mirrors.aliyun.com/pypi/packages/99/9e/e412117548182ce2148bdeacdda3bb494260c0b0184360fe0d56389b523b/markupsafe-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3524b778fe5cfb3452a09d31e7b5adefeea8c5be1d43c4f810ba09f2ceb29d37" }, + { url = "https://mirrors.aliyun.com/pypi/packages/bc/e6/fa0ffcda717ef64a5108eaa7b4f5ed28d56122c9a6d70ab8b72f9f715c80/markupsafe-3.0.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4e885a3d1efa2eadc93c894a21770e4bc67899e3543680313b09f139e149ab19" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/96/ec/2102e881fe9d25fc16cb4b25d5f5cde50970967ffa5dddafdb771237062d/markupsafe-3.0.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8709b08f4a89aa7586de0aadc8da56180242ee0ada3999749b183aa23df95025" }, + { url = "https://mirrors.aliyun.com/pypi/packages/4b/30/6f2fce1f1f205fc9323255b216ca8a235b15860c34b6798f810f05828e32/markupsafe-3.0.3-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b8512a91625c9b3da6f127803b166b629725e68af71f8184ae7e7d54686a56d6" }, + { url = "https://mirrors.aliyun.com/pypi/packages/58/47/4a0ccea4ab9f5dcb6f79c0236d954acb382202721e704223a8aafa38b5c8/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9b79b7a16f7fedff2495d684f2b59b0457c3b493778c9eed31111be64d58279f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6a/70/3780e9b72180b6fecb83a4814d84c3bf4b4ae4bf0b19c27196104149734c/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:12c63dfb4a98206f045aa9563db46507995f7ef6d83b2f68eda65c307c6829eb" }, + { url = "https://mirrors.aliyun.com/pypi/packages/98/c5/c03c7f4125180fc215220c035beac6b9cb684bc7a067c84fc69414d315f5/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8f71bc33915be5186016f675cd83a1e08523649b0e33efdb898db577ef5bb009" }, + { url = "https://mirrors.aliyun.com/pypi/packages/80/d6/2d1b89f6ca4bff1036499b1e29a1d02d282259f3681540e16563f27ebc23/markupsafe-3.0.3-cp313-cp313t-win32.whl", hash = "sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2b/98/e48a4bfba0a0ffcf9925fe2d69240bfaa19c6f7507b8cd09c70684a53c1e/markupsafe-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218" }, + { url = "https://mirrors.aliyun.com/pypi/packages/0e/72/e3cc540f351f316e9ed0f092757459afbc595824ca724cbc5a5d4263713f/markupsafe-3.0.3-cp313-cp313t-win_arm64.whl", hash = "sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287" }, + { url = "https://mirrors.aliyun.com/pypi/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026" }, + { url = "https://mirrors.aliyun.com/pypi/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737" }, + { url = "https://mirrors.aliyun.com/pypi/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97" }, + { url = "https://mirrors.aliyun.com/pypi/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe" }, + { url = "https://mirrors.aliyun.com/pypi/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9" }, + { url = "https://mirrors.aliyun.com/pypi/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4" }, + { url = "https://mirrors.aliyun.com/pypi/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab" }, + { url = "https://mirrors.aliyun.com/pypi/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634" }, + { url = "https://mirrors.aliyun.com/pypi/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50" }, + { url = "https://mirrors.aliyun.com/pypi/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5" }, + { url = "https://mirrors.aliyun.com/pypi/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc" }, + { url = "https://mirrors.aliyun.com/pypi/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9" }, + { url = "https://mirrors.aliyun.com/pypi/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa" }, +] + +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c" }, +] + +[[package]] +name = "networkx" +version = "3.4.2" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +resolution-markers = [ + "python_full_version < '3.11'", +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/b9/54/dd730b32ea14ea797530a4479b2ed46a6fb250f682a9cfb997e968bf0261/networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f" }, +] + +[[package]] +name = "networkx" +version = "3.6.1" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762" }, +] + +[[package]] +name = "ninja" +version = "1.13.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/43/73/79a0b22fc731989c708068427579e840a6cf4e937fe7ae5c5d0b7356ac22/ninja-1.13.0.tar.gz", hash = "sha256:4a40ce995ded54d9dc24f8ea37ff3bf62ad192b547f6c7126e7e25045e76f978" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/3c/74/d02409ed2aa865e051b7edda22ad416a39d81a84980f544f8de717cab133/ninja-1.13.0-py3-none-macosx_10_9_universal2.whl", hash = 
"sha256:fa2a8bfc62e31b08f83127d1613d10821775a0eb334197154c4d6067b7068ff1" }, + { url = "https://mirrors.aliyun.com/pypi/packages/8e/de/6e1cd6b84b412ac1ef327b76f0641aeb5dcc01e9d3f9eee0286d0c34fd93/ninja-1.13.0-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3d00c692fb717fd511abeb44b8c5d00340c36938c12d6538ba989fe764e79630" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c8/83/49320fb6e58ae3c079381e333575fdbcf1cca3506ee160a2dcce775046fa/ninja-1.13.0-py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:be7f478ff9f96a128b599a964fc60a6a87b9fa332ee1bd44fa243ac88d50291c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/56/c7/ba22748fb59f7f896b609cd3e568d28a0a367a6d953c24c461fe04fc4433/ninja-1.13.0-py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:60056592cf495e9a6a4bea3cd178903056ecb0943e4de45a2ea825edb6dc8d3e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/79/22/d1de07632b78ac8e6b785f41fa9aad7a978ec8c0a1bf15772def36d77aac/ninja-1.13.0-py3-none-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:1c97223cdda0417f414bf864cfb73b72d8777e57ebb279c5f6de368de0062988" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ed/de/0e6edf44d6a04dabd0318a519125ed0415ce437ad5a1ec9b9be03d9048cf/ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fb46acf6b93b8dd0322adc3a4945452a4e774b75b91293bafcc7b7f8e6517dfa" }, + { url = "https://mirrors.aliyun.com/pypi/packages/54/28/938b562f9057aaa4d6bfbeaa05e81899a47aebb3ba6751e36c027a7f5ff7/ninja-1.13.0-py3-none-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4be9c1b082d244b1ad7ef41eb8ab088aae8c109a9f3f0b3e56a252d3e00f42c1" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2a/fb/d06a3838de4f8ab866e44ee52a797b5491df823901c54943b2adb0389fbb/ninja-1.13.0-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:6739d3352073341ad284246f81339a384eec091d9851a886dfa5b00a6d48b3e2" }, + { url = "https://mirrors.aliyun.com/pypi/packages/31/bf/0d7808af695ceddc763cf251b84a9892cd7f51622dc8b4c89d5012779f06/ninja-1.13.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:11be2d22027bde06f14c343f01d31446747dbb51e72d00decca2eb99be911e2f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9d/70/c99d0c2c809f992752453cce312848abb3b1607e56d4cd1b6cded317351a/ninja-1.13.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:aa45b4037b313c2f698bc13306239b8b93b4680eb47e287773156ac9e9304714" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9f/43/c217b1153f0e499652f5e0766da8523ce3480f0a951039c7af115e224d55/ninja-1.13.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:5f8e1e8a1a30835eeb51db05cf5a67151ad37542f5a4af2a438e9490915e5b72" }, + { url = "https://mirrors.aliyun.com/pypi/packages/8c/45/9151bba2c8d0ae2b6260f71696330590de5850e5574b7b5694dce6023e20/ninja-1.13.0-py3-none-musllinux_1_2_ppc64le.whl", hash = "sha256:3d7d7779d12cb20c6d054c61b702139fd23a7a964ec8f2c823f1ab1b084150db" }, + { url = "https://mirrors.aliyun.com/pypi/packages/3c/fb/95752eb635bb8ad27d101d71bef15bc63049de23f299e312878fc21cb2da/ninja-1.13.0-py3-none-musllinux_1_2_riscv64.whl", hash = "sha256:d741a5e6754e0bda767e3274a0f0deeef4807f1fec6c0d7921a0244018926ae5" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c1/31/aa56a1a286703800c0cbe39fb4e82811c277772dc8cd084f442dd8e2938a/ninja-1.13.0-py3-none-musllinux_1_2_s390x.whl", hash = "sha256:e8bad11f8a00b64137e9b315b137d8bb6cbf3086fbdc43bf1f90fd33324d2e96" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/34/6f/5f5a54a1041af945130abdb2b8529cbef0cdcbbf9bcf3f4195378319d29a/ninja-1.13.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b4f2a072db3c0f944c32793e91532d8948d20d9ab83da9c0c7c15b5768072200" }, + { url = "https://mirrors.aliyun.com/pypi/packages/95/97/51359c77527d45943fe7a94d00a3843b81162e6c4244b3579fe8fc54cb9c/ninja-1.13.0-py3-none-win32.whl", hash = "sha256:8cfbb80b4a53456ae8a39f90ae3d7a2129f45ea164f43fadfa15dc38c4aef1c9" }, + { url = "https://mirrors.aliyun.com/pypi/packages/29/45/c0adfbfb0b5895aa18cec400c535b4f7ff3e52536e0403602fc1a23f7de9/ninja-1.13.0-py3-none-win_amd64.whl", hash = "sha256:fb8ee8719f8af47fed145cced4a85f0755dd55d45b2bddaf7431fa89803c5f3e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/df/93/a7b983643d1253bb223234b5b226e69de6cda02b76cdca7770f684b795f5/ninja-1.13.0-py3-none-win_arm64.whl", hash = "sha256:3c0b40b1f0bba764644385319028650087b4c1b18cdfa6f45cb39a3669b81aa9" }, +] + +[[package]] +name = "numpy" +version = "2.2.6" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +resolution-markers = [ + "python_full_version < '3.11'", +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/9a/3e/ed6db5be21ce87955c0cbd3009f2803f59fa08df21b5df06862e2d8e2bdd/numpy-2.2.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b412caa66f72040e6d268491a59f2c43bf03eb6c96dd8f0307829feb7fa2b6fb" }, + { url = "https://mirrors.aliyun.com/pypi/packages/22/c2/4b9221495b2a132cc9d2eb862e21d42a009f5a60e45fc44b00118c174bff/numpy-2.2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e41fd67c52b86603a91c1a505ebaef50b3314de0213461c7a6e99c9a3beff90" }, + { url = "https://mirrors.aliyun.com/pypi/packages/fd/77/dc2fcfc66943c6410e2bf598062f5959372735ffda175b39906d54f02349/numpy-2.2.6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:37e990a01ae6ec7fe7fa1c26c55ecb672dd98b19c3d0e1d1f326fa13cb38d163" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7a/4f/1cb5fdc353a5f5cc7feb692db9b8ec2c3d6405453f982435efc52561df58/numpy-2.2.6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:5a6429d4be8ca66d889b7cf70f536a397dc45ba6faeb5f8c5427935d9592e9cf" }, + { url = "https://mirrors.aliyun.com/pypi/packages/eb/17/96a3acd228cec142fcb8723bd3cc39c2a474f7dcf0a5d16731980bcafa95/numpy-2.2.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efd28d4e9cd7d7a8d39074a4d44c63eda73401580c5c76acda2ce969e0a38e83" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b4/63/3de6a34ad7ad6646ac7d2f55ebc6ad439dbbf9c4370017c50cf403fb19b5/numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc7b73d02efb0e18c000e9ad8b83480dfcd5dfd11065997ed4c6747470ae8915" }, + { url = "https://mirrors.aliyun.com/pypi/packages/07/b6/89d837eddef52b3d0cec5c6ba0456c1bf1b9ef6a6672fc2b7873c3ec4e2e/numpy-2.2.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:74d4531beb257d2c3f4b261bfb0fc09e0f9ebb8842d82a7b4209415896adc680" }, + { url = "https://mirrors.aliyun.com/pypi/packages/01/c8/dc6ae86e3c61cfec1f178e5c9f7858584049b6093f843bca541f94120920/numpy-2.2.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8fc377d995680230e83241d8a96def29f204b5782f371c532579b4f20607a289" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/5b/c5/0064b1b7e7c89137b471ccec1fd2282fceaae0ab3a9550f2568782d80357/numpy-2.2.6-cp310-cp310-win32.whl", hash = "sha256:b093dd74e50a8cba3e873868d9e93a85b78e0daf2e98c6797566ad8044e8363d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/a3/dd/4b822569d6b96c39d1215dbae0582fd99954dcbcf0c1a13c61783feaca3f/numpy-2.2.6-cp310-cp310-win_amd64.whl", hash = "sha256:f0fd6321b839904e15c46e0d257fdd101dd7f530fe03fd6359c1ea63738703f3" }, + { url = "https://mirrors.aliyun.com/pypi/packages/da/a8/4f83e2aa666a9fbf56d6118faaaf5f1974d456b1823fda0a176eff722839/numpy-2.2.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f9f1adb22318e121c5c69a09142811a201ef17ab257a1e66ca3025065b7f53ae" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b3/2b/64e1affc7972decb74c9e29e5649fac940514910960ba25cd9af4488b66c/numpy-2.2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c820a93b0255bc360f53eca31a0e676fd1101f673dda8da93454a12e23fc5f7a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/4a/9f/0121e375000b5e50ffdd8b25bf78d8e1a5aa4cca3f185d41265198c7b834/numpy-2.2.6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3d70692235e759f260c3d837193090014aebdf026dfd167834bcba43e30c2a42" }, + { url = "https://mirrors.aliyun.com/pypi/packages/31/0d/b48c405c91693635fbe2dcd7bc84a33a602add5f63286e024d3b6741411c/numpy-2.2.6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:481b49095335f8eed42e39e8041327c05b0f6f4780488f61286ed3c01368d491" }, + { url = "https://mirrors.aliyun.com/pypi/packages/52/b8/7f0554d49b565d0171eab6e99001846882000883998e7b7d9f0d98b1f934/numpy-2.2.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b64d8d4d17135e00c8e346e0a738deb17e754230d7e0810ac5012750bbd85a5a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b3/dd/2238b898e51bd6d389b7389ffb20d7f4c10066d80351187ec8e303a5a475/numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba10f8411898fc418a521833e014a77d3ca01c15b0c6cdcce6a0d2897e6dbbdf" }, + { url = "https://mirrors.aliyun.com/pypi/packages/83/6c/44d0325722cf644f191042bf47eedad61c1e6df2432ed65cbe28509d404e/numpy-2.2.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bd48227a919f1bafbdda0583705e547892342c26fb127219d60a5c36882609d1" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ae/9d/81e8216030ce66be25279098789b665d49ff19eef08bfa8cb96d4957f422/numpy-2.2.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9551a499bf125c1d4f9e250377c1ee2eddd02e01eac6644c080162c0c51778ab" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6a/fd/e19617b9530b031db51b0926eed5345ce8ddc669bb3bc0044b23e275ebe8/numpy-2.2.6-cp311-cp311-win32.whl", hash = "sha256:0678000bb9ac1475cd454c6b8c799206af8107e310843532b04d49649c717a47" }, + { url = "https://mirrors.aliyun.com/pypi/packages/31/0a/f354fb7176b81747d870f7991dc763e157a934c717b67b58456bc63da3df/numpy-2.2.6-cp311-cp311-win_amd64.whl", hash = "sha256:e8213002e427c69c45a52bbd94163084025f533a55a59d6f9c5b820774ef3303" }, + { url = "https://mirrors.aliyun.com/pypi/packages/82/5d/c00588b6cf18e1da539b45d3598d3557084990dcc4331960c15ee776ee41/numpy-2.2.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41c5a21f4a04fa86436124d388f6ed60a9343a6f767fced1a8a71c3fbca038ff" }, + { url = "https://mirrors.aliyun.com/pypi/packages/66/ee/560deadcdde6c2f90200450d5938f63a34b37e27ebff162810f716f6a230/numpy-2.2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de749064336d37e340f640b05f24e9e3dd678c57318c7289d222a8a2f543e90c" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/3c/65/4baa99f1c53b30adf0acd9a5519078871ddde8d2339dc5a7fde80d9d87da/numpy-2.2.6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:894b3a42502226a1cac872f840030665f33326fc3dac8e57c607905773cdcde3" }, + { url = "https://mirrors.aliyun.com/pypi/packages/cc/89/e5a34c071a0570cc40c9a54eb472d113eea6d002e9ae12bb3a8407fb912e/numpy-2.2.6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:71594f7c51a18e728451bb50cc60a3ce4e6538822731b2933209a1f3614e9282" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f8/35/8c80729f1ff76b3921d5c9487c7ac3de9b2a103b1cd05e905b3090513510/numpy-2.2.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2618db89be1b4e05f7a1a847a9c1c0abd63e63a1607d892dd54668dd92faf87" }, + { url = "https://mirrors.aliyun.com/pypi/packages/8c/3d/1e1db36cfd41f895d266b103df00ca5b3cbe965184df824dec5c08c6b803/numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd83c01228a688733f1ded5201c678f0c53ecc1006ffbc404db9f7a899ac6249" }, + { url = "https://mirrors.aliyun.com/pypi/packages/61/c6/03ed30992602c85aa3cd95b9070a514f8b3c33e31124694438d88809ae36/numpy-2.2.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:37c0ca431f82cd5fa716eca9506aefcabc247fb27ba69c5062a6d3ade8cf8f49" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b7/25/5761d832a81df431e260719ec45de696414266613c9ee268394dd5ad8236/numpy-2.2.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fe27749d33bb772c80dcd84ae7e8df2adc920ae8297400dabec45f0dedb3f6de" }, + { url = "https://mirrors.aliyun.com/pypi/packages/57/0a/72d5a3527c5ebffcd47bde9162c39fae1f90138c961e5296491ce778e682/numpy-2.2.6-cp312-cp312-win32.whl", hash = "sha256:4eeaae00d789f66c7a25ac5f34b71a7035bb474e679f410e5e1a94deb24cf2d4" }, + { url = "https://mirrors.aliyun.com/pypi/packages/36/fa/8c9210162ca1b88529ab76b41ba02d433fd54fecaf6feb70ef9f124683f1/numpy-2.2.6-cp312-cp312-win_amd64.whl", hash = "sha256:c1f9540be57940698ed329904db803cf7a402f3fc200bfe599334c9bd84a40b2" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f9/5c/6657823f4f594f72b5471f1db1ab12e26e890bb2e41897522d134d2a3e81/numpy-2.2.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0811bb762109d9708cca4d0b13c4f67146e3c3b7cf8d34018c722adb2d957c84" }, + { url = "https://mirrors.aliyun.com/pypi/packages/dc/9e/14520dc3dadf3c803473bd07e9b2bd1b69bc583cb2497b47000fed2fa92f/numpy-2.2.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:287cc3162b6f01463ccd86be154f284d0893d2b3ed7292439ea97eafa8170e0b" }, + { url = "https://mirrors.aliyun.com/pypi/packages/4f/06/7e96c57d90bebdce9918412087fc22ca9851cceaf5567a45c1f404480e9e/numpy-2.2.6-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:f1372f041402e37e5e633e586f62aa53de2eac8d98cbfb822806ce4bbefcb74d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/73/ed/63d920c23b4289fdac96ddbdd6132e9427790977d5457cd132f18e76eae0/numpy-2.2.6-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:55a4d33fa519660d69614a9fad433be87e5252f4b03850642f88993f7b2ca566" }, + { url = "https://mirrors.aliyun.com/pypi/packages/85/c5/e19c8f99d83fd377ec8c7e0cf627a8049746da54afc24ef0a0cb73d5dfb5/numpy-2.2.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f92729c95468a2f4f15e9bb94c432a9229d0d50de67304399627a943201baa2f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/19/49/4df9123aafa7b539317bf6d342cb6d227e49f7a35b99c287a6109b13dd93/numpy-2.2.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:1bc23a79bfabc5d056d106f9befb8d50c31ced2fbc70eedb8155aec74a45798f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b2/6c/04b5f47f4f32f7c2b0e7260442a8cbcf8168b0e1a41ff1495da42f42a14f/numpy-2.2.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e3143e4451880bed956e706a3220b4e5cf6172ef05fcc397f6f36a550b1dd868" }, + { url = "https://mirrors.aliyun.com/pypi/packages/17/0a/5cd92e352c1307640d5b6fec1b2ffb06cd0dabe7d7b8227f97933d378422/numpy-2.2.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b4f13750ce79751586ae2eb824ba7e1e8dba64784086c98cdbbcc6a42112ce0d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f0/3b/5cba2b1d88760ef86596ad0f3d484b1cbff7c115ae2429678465057c5155/numpy-2.2.6-cp313-cp313-win32.whl", hash = "sha256:5beb72339d9d4fa36522fc63802f469b13cdbe4fdab4a288f0c441b74272ebfd" }, + { url = "https://mirrors.aliyun.com/pypi/packages/cb/3b/d58c12eafcb298d4e6d0d40216866ab15f59e55d148a5658bb3132311fcf/numpy-2.2.6-cp313-cp313-win_amd64.whl", hash = "sha256:b0544343a702fa80c95ad5d3d608ea3599dd54d4632df855e4c8d24eb6ecfa1c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6b/9e/4bf918b818e516322db999ac25d00c75788ddfd2d2ade4fa66f1f38097e1/numpy-2.2.6-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0bca768cd85ae743b2affdc762d617eddf3bcf8724435498a1e80132d04879e6" }, + { url = "https://mirrors.aliyun.com/pypi/packages/61/66/d2de6b291507517ff2e438e13ff7b1e2cdbdb7cb40b3ed475377aece69f9/numpy-2.2.6-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:fc0c5673685c508a142ca65209b4e79ed6740a4ed6b2267dbba90f34b0b3cfda" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e4/25/480387655407ead912e28ba3a820bc69af9adf13bcbe40b299d454ec011f/numpy-2.2.6-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:5bd4fc3ac8926b3819797a7c0e2631eb889b4118a9898c84f585a54d475b7e40" }, + { url = "https://mirrors.aliyun.com/pypi/packages/aa/4a/6e313b5108f53dcbf3aca0c0f3e9c92f4c10ce57a0a721851f9785872895/numpy-2.2.6-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:fee4236c876c4e8369388054d02d0e9bb84821feb1a64dd59e137e6511a551f8" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b7/30/172c2d5c4be71fdf476e9de553443cf8e25feddbe185e0bd88b096915bcc/numpy-2.2.6-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e1dda9c7e08dc141e0247a5b8f49cf05984955246a327d4c48bda16821947b2f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/12/fb/9e743f8d4e4d3c710902cf87af3512082ae3d43b945d5d16563f26ec251d/numpy-2.2.6-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f447e6acb680fd307f40d3da4852208af94afdfab89cf850986c3ca00562f4fa" }, + { url = "https://mirrors.aliyun.com/pypi/packages/12/75/ee20da0e58d3a66f204f38916757e01e33a9737d0b22373b3eb5a27358f9/numpy-2.2.6-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:389d771b1623ec92636b0786bc4ae56abafad4a4c513d36a55dce14bd9ce8571" }, + { url = "https://mirrors.aliyun.com/pypi/packages/76/95/bef5b37f29fc5e739947e9ce5179ad402875633308504a52d188302319c8/numpy-2.2.6-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8e9ace4a37db23421249ed236fdcdd457d671e25146786dfc96835cd951aa7c1" }, + { url = "https://mirrors.aliyun.com/pypi/packages/09/04/f2f83279d287407cf36a7a8053a5abe7be3622a4363337338f2585e4afda/numpy-2.2.6-cp313-cp313t-win32.whl", hash = "sha256:038613e9fb8c72b0a41f025a7e4c3f0b7a1b5d768ece4796b674c8f3fe13efff" }, + { url = "https://mirrors.aliyun.com/pypi/packages/67/0e/35082d13c09c02c011cf21570543d202ad929d961c02a147493cb0c2bdf5/numpy-2.2.6-cp313-cp313t-win_amd64.whl", hash = 
"sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9e/3b/d94a75f4dbf1ef5d321523ecac21ef23a3cd2ac8b78ae2aac40873590229/numpy-2.2.6-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0b605b275d7bd0c640cad4e5d30fa701a8d59302e127e5f79138ad62762c3e3d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/17/f4/09b2fa1b58f0fb4f7c7963a1649c64c4d315752240377ed74d9cd878f7b5/numpy-2.2.6-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:7befc596a7dc9da8a337f79802ee8adb30a552a94f792b9c9d18c840055907db" }, + { url = "https://mirrors.aliyun.com/pypi/packages/af/30/feba75f143bdc868a1cc3f44ccfa6c4b9ec522b36458e738cd00f67b573f/numpy-2.2.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce47521a4754c8f4593837384bd3424880629f718d87c5d44f8ed763edd63543" }, + { url = "https://mirrors.aliyun.com/pypi/packages/37/48/ac2a9584402fb6c0cd5b5d1a91dcf176b15760130dd386bbafdbfe3640bf/numpy-2.2.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d042d24c90c41b54fd506da306759e06e568864df8ec17ccc17e9e884634fd00" }, +] + +[[package]] +name = "numpy" +version = "2.4.4" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/d7/9f/b8cef5bffa569759033adda9481211426f12f53299629b410340795c2514/numpy-2.4.4.tar.gz", hash = "sha256:2d390634c5182175533585cc89f3608a4682ccb173cc9bb940b2881c8d6f8fa0" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/ef/c6/4218570d8c8ecc9704b5157a3348e486e84ef4be0ed3e38218ab473c83d2/numpy-2.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f983334aea213c99992053ede6168500e5f086ce74fbc4acc3f2b00f5762e9db" }, + { url = "https://mirrors.aliyun.com/pypi/packages/dd/92/b4d922c4a5f5dab9ed44e6153908a5c665b71acf183a83b93b690996e39b/numpy-2.4.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:72944b19f2324114e9dc86a159787333b77874143efcf89a5167ef83cfee8af0" }, + { url = "https://mirrors.aliyun.com/pypi/packages/8a/dc/df98c095978fa6ee7b9a9387d1d58cbb3d232d0e69ad169a4ce784bde4fd/numpy-2.4.4-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:86b6f55f5a352b48d7fbfd2dbc3d5b780b2d79f4d3c121f33eb6efb22e9a2015" }, + { url = "https://mirrors.aliyun.com/pypi/packages/28/34/b3fdcec6e725409223dd27356bdf5a3c2cc2282e428218ecc9cb7acc9763/numpy-2.4.4-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:ba1f4fc670ed79f876f70082eff4f9583c15fb9a4b89d6188412de4d18ae2f40" }, + { url = "https://mirrors.aliyun.com/pypi/packages/68/62/63417c13aa35d57bee1337c67446761dc25ea6543130cf868eace6e8157b/numpy-2.4.4-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a87ec22c87be071b6bdbd27920b129b94f2fc964358ce38f3822635a3e2e03d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/cf/c5/9fcb7e0e69cef59cf10c746b84f7d58b08bc66a6b7d459783c5a4f6101a6/numpy-2.4.4-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:df3775294accfdd75f32c74ae39fcba920c9a378a2fc18a12b6820aa8c1fb502" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7e/43/80020edacb3f84b9efdd1591120a4296462c23fd8db0dde1666f6ef66f13/numpy-2.4.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0d4e437e295f18ec29bc79daf55e8a47a9113df44d66f702f02a293d93a2d6dd" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/fd/06/af0658593b18a5f73532d377188b964f239eb0894e664a6c12f484472f97/numpy-2.4.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6aa3236c78803afbcb255045fbef97a9e25a1f6c9888357d205ddc42f4d6eba5" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e6/ce/13a09ed65f5d0ce5c7dd0669250374c6e379910f97af2c08c57b0608eee4/numpy-2.4.4-cp311-cp311-win32.whl", hash = "sha256:30caa73029a225b2d40d9fae193e008e24b2026b7ee1a867b7ee8d96ca1a448e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/bd/63/05d193dbb4b5eec1eca73822d80da98b511f8328ad4ae3ca4caf0f4db91d/numpy-2.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:6bbe4eb67390b0a0265a2c25458f6b90a409d5d069f1041e6aff1e27e3d9a79e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/87/c5/8168052f080c26fa984c413305012be54741c9d0d74abd7fbeeccae3889f/numpy-2.4.4-cp311-cp311-win_arm64.whl", hash = "sha256:fcfe2045fd2e8f3cb0ce9d4ba6dba6333b8fa05bb8a4939c908cd43322d14c7e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/28/05/32396bec30fb2263770ee910142f49c1476d08e8ad41abf8403806b520ce/numpy-2.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:15716cfef24d3a9762e3acdf87e27f58dc823d1348f765bbea6bef8c639bfa1b" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c5/f3/a983d28637bfcd763a9c7aafdb6d5c0ebf3d487d1e1459ffdb57e2f01117/numpy-2.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23cbfd4c17357c81021f21540da84ee282b9c8fba38a03b7b9d09ba6b951421e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9b/fd/e5ecca1e78c05106d98028114f5c00d3eddb41207686b2b7de3e477b0e22/numpy-2.4.4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8b3b60bb7cba2c8c81837661c488637eee696f59a877788a396d33150c35d842" }, + { url = "https://mirrors.aliyun.com/pypi/packages/de/2f/702a4594413c1a8632092beae8aba00f1d67947389369b3777aed783fdca/numpy-2.4.4-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e4a010c27ff6f210ff4c6ef34394cd61470d01014439b192ec22552ee867f2a8" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7f/37/eed308a8f56cba4d1fdf467a4fc67ef4ff4bf1c888f5fc980481890104b1/numpy-2.4.4-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f9e75681b59ddaa5e659898085ae0eaea229d054f2ac0c7e563a62205a700121" }, + { url = "https://mirrors.aliyun.com/pypi/packages/0a/0d/0e3ecece05b7a7e87ab9fb587855548da437a061326fff64a223b6dcb78a/numpy-2.4.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:81f4a14bee47aec54f883e0cad2d73986640c1590eb9bfaaba7ad17394481e6e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/34/49/f2312c154b82a286758ee2f1743336d50651f8b5195db18cdb63675ff649/numpy-2.4.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:62d6b0f03b694173f9fcb1fb317f7222fd0b0b103e784c6549f5e53a27718c44" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7b/e9/736d17bd77f1b0ec4f9901aaec129c00d59f5d84d5e79bba540ef12c2330/numpy-2.4.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fbc356aae7adf9e6336d336b9c8111d390a05df88f1805573ebb0807bd06fd1d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/63/f6/d417977c5f519b17c8a5c3bc9e8304b0908b0e21136fe43bf628a1343914/numpy-2.4.4-cp312-cp312-win32.whl", hash = "sha256:0d35aea54ad1d420c812bfa0385c71cd7cc5bcf7c65fed95fc2cd02fe8c79827" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2d/5b/e1deebf88ff431b01b7406ca3583ab2bbb90972bbe1c568732e49c844f7e/numpy-2.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:b5f0362dc928a6ecd9db58868fca5e48485205e3855957bdedea308f8672ea4a" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/58/89/e4e856ac82a68c3ed64486a544977d0e7bdd18b8da75b78a577ca31c4395/numpy-2.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:846300f379b5b12cc769334464656bc882e0735d27d9726568bc932fdc49d5ec" }, + { url = "https://mirrors.aliyun.com/pypi/packages/14/1d/d0a583ce4fefcc3308806a749a536c201ed6b5ad6e1322e227ee4848979d/numpy-2.4.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:08f2e31ed5e6f04b118e49821397f12767934cfdd12a1ce86a058f91e004ee50" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c1/62/2b7a48fbb745d344742c0277f01286dead15f3f68e4f359fbfcf7b48f70f/numpy-2.4.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e823b8b6edc81e747526f70f71a9c0a07ac4e7ad13020aa736bb7c9d67196115" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e5/87/499737bfba066b4a3bebff24a8f1c5b2dee410b209bc6668c9be692580f0/numpy-2.4.4-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4a19d9dba1a76618dd86b164d608566f393f8ec6ac7c44f0cc879011c45e65af" }, + { url = "https://mirrors.aliyun.com/pypi/packages/cd/da/464d551604320d1491bc345efed99b4b7034143a85787aab78d5691d5a0e/numpy-2.4.4-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:d2a8490669bfe99a233298348acc2d824d496dee0e66e31b66a6022c2ad74a5c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7d/90/8d23e3b0dafd024bf31bdec225b3bb5c2dbfa6912f8a53b8659f21216cbf/numpy-2.4.4-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:45dbed2ab436a9e826e302fcdcbe9133f9b0006e5af7168afb8963a6520da103" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d1/73/a9d864e42a01896bb5974475438f16086be9ba1f0d19d0bb7a07427c4a8b/numpy-2.4.4-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c901b15172510173f5cb310eae652908340f8dede90fff9e3bf6c0d8dfd92f83" }, + { url = "https://mirrors.aliyun.com/pypi/packages/34/fb/14570d65c3bde4e202a031210475ae9cde9b7686a2e7dc97ee67d2833b35/numpy-2.4.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:99d838547ace2c4aace6c4f76e879ddfe02bb58a80c1549928477862b7a6d6ed" }, + { url = "https://mirrors.aliyun.com/pypi/packages/8a/77/2ba9d87081fd41f6d640c83f26fb7351e536b7ce6dd9061b6af5904e8e46/numpy-2.4.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0aec54fd785890ecca25a6003fd9a5aed47ad607bbac5cd64f836ad8666f4959" }, + { url = "https://mirrors.aliyun.com/pypi/packages/a2/23/52666c9a41708b0853fa3b1a12c90da38c507a3074883823126d4e9d5b30/numpy-2.4.4-cp313-cp313-win32.whl", hash = "sha256:07077278157d02f65c43b1b26a3886bce886f95d20aabd11f87932750dfb14ed" }, + { url = "https://mirrors.aliyun.com/pypi/packages/57/fb/48649b4971cde70d817cf97a2a2fdc0b4d8308569f1dd2f2611959d2e0cf/numpy-2.4.4-cp313-cp313-win_amd64.whl", hash = "sha256:5c70f1cc1c4efbe316a572e2d8b9b9cc44e89b95f79ca3331553fbb63716e2bf" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ba/d8/11490cddd564eb4de97b4579ef6bfe6a736cc07e94c1598590ae25415e01/numpy-2.4.4-cp313-cp313-win_arm64.whl", hash = "sha256:ef4059d6e5152fa1a39f888e344c73fdc926e1b2dd58c771d67b0acfbf2aa67d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/99/5d/dab4339177a905aad3e2221c915b35202f1ec30d750dd2e5e9d9a72b804b/numpy-2.4.4-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:4bbc7f303d125971f60ec0aaad5e12c62d0d2c925f0ab1273debd0e4ba37aba5" }, + { url = "https://mirrors.aliyun.com/pypi/packages/eb/e4/0564a65e7d3d97562ed6f9b0fd0fb0a6f559ee444092f105938b50043876/numpy-2.4.4-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:4d6d57903571f86180eb98f8f0c839fa9ebbfb031356d87f1361be91e433f5b7" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/29/8d/35a3a6ce5ad371afa58b4700f1c820f8f279948cca32524e0a695b0ded83/numpy-2.4.4-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:4636de7fd195197b7535f231b5de9e4b36d2c440b6e566d2e4e4746e6af0ca93" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f4/da/477731acbd5a58a946c736edfdabb2ac5b34c3d08d1ba1a7b437fa0884df/numpy-2.4.4-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ad2e2ef14e0b04e544ea2fa0a36463f847f113d314aa02e5b402fdf910ef309e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e6/db/338535d9b152beabeb511579598418ba0212ce77cf9718edd70262cc4370/numpy-2.4.4-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a285b3b96f951841799528cd1f4f01cd70e7e0204b4abebac9463eecfcf2a40" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e2/a9/ad248e8f58beb7a0219b413c9c7d8151c5d285f7f946c3e26695bdbbe2df/numpy-2.4.4-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f8474c4241bc18b750be2abea9d7a9ec84f46ef861dbacf86a4f6e043401f79e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b5/1a/3b88ccd3694681356f70da841630e4725a7264d6a885c8d442a697e1146b/numpy-2.4.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4e874c976154687c1f71715b034739b45c7711bec81db01914770373d125e392" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c2/c9/fcfd5d0639222c6eac7f304829b04892ef51c96a75d479214d77e3ce6e33/numpy-2.4.4-cp313-cp313t-win32.whl", hash = "sha256:9c585a1790d5436a5374bac930dad6ed244c046ed91b2b2a3634eb2971d21008" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d5/e3/3938a61d1c538aaec8ed6fd6323f57b0c2d2d2219512434c5c878db76553/numpy-2.4.4-cp313-cp313t-win_amd64.whl", hash = "sha256:93e15038125dc1e5345d9b5b68aa7f996ec33b98118d18c6ca0d0b7d6198b7e8" }, + { url = "https://mirrors.aliyun.com/pypi/packages/97/6a/7e345032cc60501721ef94e0e30b60f6b0bd601f9174ebd36389a2b86d40/numpy-2.4.4-cp313-cp313t-win_arm64.whl", hash = "sha256:0dfd3f9d3adbe2920b68b5cd3d51444e13a10792ec7154cd0a2f6e74d4ab3233" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6e/06/c54062f85f673dd5c04cbe2f14c3acb8c8b95e3384869bb8cc9bff8cb9df/numpy-2.4.4-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:f169b9a863d34f5d11b8698ead99febeaa17a13ca044961aa8e2662a6c7766a0" }, + { url = "https://mirrors.aliyun.com/pypi/packages/4c/39/8a320264a84404c74cc7e79715de85d6130fa07a0898f67fb5cd5bd79908/numpy-2.4.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2483e4584a1cb3092da4470b38866634bafb223cbcd551ee047633fd2584599a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/91/fb/287076b2614e1d1044235f50f03748f31fa287e3dbe6abeb35cdfa351eca/numpy-2.4.4-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:2d19e6e2095506d1736b7d80595e0f252d76b89f5e715c35e06e937679ea7d7a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/63/eb/fcc338595309910de6ecabfcef2419a9ce24399680bfb149421fa2df1280/numpy-2.4.4-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:6a246d5914aa1c820c9443ddcee9c02bec3e203b0c080349533fae17727dfd1b" }, + { url = "https://mirrors.aliyun.com/pypi/packages/44/5d/e7e9044032a716cdfaa3fba27a8e874bf1c5f1912a1ddd4ed071bf8a14a6/numpy-2.4.4-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:989824e9faf85f96ec9c7761cd8d29c531ad857bfa1daa930cba85baaecf1a9a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/98/7c/21252050676612625449b4807d6b695b9ce8a7c9e1c197ee6216c8a65c7c/numpy-2.4.4-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:27a8d92cd10f1382a67d7cf4db7ce18341b66438bdd9f691d7b0e48d104c2a9d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b1/29/56d2bbef9465db24ef25393383d761a1af4f446a1df9b8cded4fe3a5a5d7/numpy-2.4.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:e44319a2953c738205bf3354537979eaa3998ed673395b964c1176083dd46252" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e3/2b/a35a6d7589d21f44cea7d0a98de5ddcbb3d421b2622a5c96b1edf18707c3/numpy-2.4.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e892aff75639bbef0d2a2cfd55535510df26ff92f63c92cd84ef8d4ba5a5557f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/64/c9/d52ec581f2390e0f5f85cbfd80fb83d965fc15e9f0e1aec2195faa142cde/numpy-2.4.4-cp314-cp314-win32.whl", hash = "sha256:1378871da56ca8943c2ba674530924bb8ca40cd228358a3b5f302ad60cf875fc" }, + { url = "https://mirrors.aliyun.com/pypi/packages/fa/22/4cc31a62a6c7b74a8730e31a4274c5dc80e005751e277a2ce38e675e4923/numpy-2.4.4-cp314-cp314-win_amd64.whl", hash = "sha256:715d1c092715954784bc79e1174fc2a90093dc4dc84ea15eb14dad8abdcdeb74" }, + { url = "https://mirrors.aliyun.com/pypi/packages/70/2e/14cda6f4d8e396c612d1bf97f22958e92148801d7e4f110cabebdc0eef4b/numpy-2.4.4-cp314-cp314-win_arm64.whl", hash = "sha256:2c194dd721e54ecad9ad387c1d35e63dce5c4450c6dc7dd5611283dda239aabb" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b1/e8/8fed8c8d848d7ecea092dc3469643f9d10bc3a134a815a3b033da1d2039b/numpy-2.4.4-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2aa0613a5177c264ff5921051a5719d20095ea586ca88cc802c5c218d1c67d3e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/05/1a/d8007a5138c179c2bf33ef44503e83d70434d2642877ee8fbb230e7c0548/numpy-2.4.4-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:42c16925aa5a02362f986765f9ebabf20de75cdefdca827d14315c568dcab113" }, + { url = "https://mirrors.aliyun.com/pypi/packages/99/64/ffb99ac6ae93faf117bcbd5c7ba48a7f45364a33e8e458545d3633615dda/numpy-2.4.4-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:874f200b2a981c647340f841730fc3a2b54c9d940566a3c4149099591e2c4c3d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6e/6e/795cc078b78a384052e73b2f6281ff7a700e9bf53bcce2ee579d4f6dd879/numpy-2.4.4-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9b39d38a9bd2ae1becd7eac1303d031c5c110ad31f2b319c6e7d98b135c934d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/5f/86/2acbda8cc2af5f3d7bfc791192863b9e3e19674da7b5e533fded124d1299/numpy-2.4.4-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b268594bccac7d7cf5844c7732e3f20c50921d94e36d7ec9b79e9857694b1b2f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/bc/59/cafd83018f4aa55e0ac6fa92aa066c0a1877b77a615ceff1711c260ffae8/numpy-2.4.4-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ac6b31e35612a26483e20750126d30d0941f949426974cace8e6b5c58a3657b0" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f0/85/a42548db84e65ece46ab2caea3d3f78b416a47af387fcbb47ec28e660dc2/numpy-2.4.4-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8e3ed142f2728df44263aaf5fb1f5b0b99f4070c553a0d7f033be65338329150" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ed/ad/483d9e262f4b831000062e5d8a45e342166ec8aaa1195264982bca267e62/numpy-2.4.4-cp314-cp314t-win32.whl", hash = "sha256:dddbbd259598d7240b18c9d87c56a9d2fb3b02fe266f49a7c101532e78c1d871" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c7/03/2fc4e14c7bd4ff2964b74ba90ecb8552540b6315f201df70f137faa5c589/numpy-2.4.4-cp314-cp314t-win_amd64.whl", hash = 
"sha256:a7164afb23be6e37ad90b2f10426149fd75aee07ca55653d2aa41e66c4ef697e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/58/78/548fb8e07b1a341746bfbecb32f2c268470f45fa028aacdbd10d9bc73aab/numpy-2.4.4-cp314-cp314t-win_arm64.whl", hash = "sha256:ba203255017337d39f89bdd58417f03c4426f12beed0440cfd933cb15f8669c7" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6b/33/8fae8f964a4f63ed528264ddf25d2b683d0b663e3cba26961eb838a7c1bd/numpy-2.4.4-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:58c8b5929fcb8287cbd6f0a3fae19c6e03a5c48402ae792962ac465224a629a4" }, + { url = "https://mirrors.aliyun.com/pypi/packages/bc/d0/1aabee441380b981cf8cdda3ae7a46aa827d1b5a8cce84d14598bc94d6d9/numpy-2.4.4-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:eea7ac5d2dce4189771cedb559c738a71512768210dc4e4753b107a2048b3d0e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/a5/b8/aafb0d1065416894fccf4df6b49ef22b8db045187949545bced89c034b8e/numpy-2.4.4-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:51fc224f7ca4d92656d5a5eb315f12eb5fe2c97a66249aa7b5f562528a3be38c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d6/77/063baa20b08b431038c7f9ff5435540c7b7265c78cf56012a483019ca72d/numpy-2.4.4-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:28a650663f7314afc3e6ec620f44f333c386aad9f6fc472030865dc0ebb26ee3" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c7/a8/379542d45a14f149444c5c4c4e7714707239ce9cc1de8c2803958889da14/numpy-2.4.4-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:19710a9ca9992d7174e9c52f643d4272dcd1558c5f7af7f6f8190f633bd651a7" }, + { url = "https://mirrors.aliyun.com/pypi/packages/a2/c8/f0a45426d6d21e7ea3310a15cf90c43a14d9232c31a837702dba437f3373/numpy-2.4.4-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9b2aec6af35c113b05695ebb5749a787acd63cafc83086a05771d1e1cd1e555f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/04/74/f4c001f4714c3ad9ce037e18cf2b9c64871a84951eaa0baf683a9ca9301c/numpy-2.4.4-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:f2cf083b324a467e1ab358c105f6cad5ea950f50524668a80c486ff1db24e119" }, +] + +[[package]] +name = "nvidia-cublas-cu12" +version = "12.8.4.1" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/29/99/db44d685f0e257ff0e213ade1964fc459b4a690a73293220e98feb3307cf/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:b86f6dd8935884615a0683b663891d43781b819ac4f2ba2b0c9604676af346d0" }, + { url = "https://mirrors.aliyun.com/pypi/packages/dc/61/e24b560ab2e2eaeb3c839129175fb330dfcfc29e5203196e5541a4c44682/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142" }, +] + +[[package]] +name = "nvidia-cuda-cupti-cu12" +version = "12.8.90" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/d5/1f/b3bd73445e5cb342727fd24fe1f7b748f690b460acadc27ea22f904502c8/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4412396548808ddfed3f17a467b104ba7751e6b58678a4b840675c56d21cf7ed" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f8/02/2adcaa145158bf1a8295d83591d22e4103dbfd821bcaf6f3f53151ca4ffa/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182" }, +] + +[[package]] +name = "nvidia-cuda-nvrtc-cu12" +version = "12.8.93" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/05/6b/32f747947df2da6994e999492ab306a903659555dddc0fbdeb9d71f75e52/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994" }, + { url = "https://mirrors.aliyun.com/pypi/packages/eb/d1/e50d0acaab360482034b84b6e27ee83c6738f7d32182b987f9c7a4e32962/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc1fec1e1637854b4c0a65fb9a8346b51dd9ee69e61ebaccc82058441f15bce8" }, +] + +[[package]] +name = "nvidia-cuda-runtime-cu12" +version = "12.8.90" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/7c/75/f865a3b236e4647605ea34cc450900854ba123834a5f1598e160b9530c3a/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:52bf7bbee900262ffefe5e9d5a2a69a30d97e2bc5bb6cc866688caa976966e3d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/0d/9b/a997b638fcd068ad6e4d53b8551a7d30fe8b404d6f1804abf1df69838932/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90" }, +] + +[[package]] +name = "nvidia-cudnn-cu12" +version = "9.19.0.56" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "nvidia-cublas-cu12" }, +] +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/09/b8/277c51962ee46fa3e5b203ac5f76107c650f781d6891e681e28e6f3e9fe6/nvidia_cudnn_cu12-9.19.0.56-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:08caaf27fe556aca82a3ee3b5aa49a77e7de0cfcb7ff4e5c29da426387a8267e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c5/41/65225d42fba06fb3dd3972485ea258e7dd07a40d6e01c95da6766ad87354/nvidia_cudnn_cu12-9.19.0.56-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:ac6ad90a075bb33a94f2b4cf4622eac13dd4dc65cf6dd9c7572a318516a36625" }, +] + +[[package]] +name = "nvidia-cufft-cu12" +version = "11.3.3.83" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12" }, +] +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/60/bc/7771846d3a0272026c416fbb7e5f4c1f146d6d80704534d0b187dd6f4800/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:848ef7224d6305cdb2a4df928759dca7b1201874787083b6e7550dd6765ce69a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74" }, +] + +[[package]] +name = "nvidia-cufile-cu12" +version = "1.13.1.3" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/bb/fe/1bcba1dfbfb8d01be8d93f07bfc502c93fa23afa6fd5ab3fc7c1df71038a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/1e/f5/5607710447a6fe9fd9b3283956fceeee8a06cda1d2f56ce31371f595db2a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:4beb6d4cce47c1a0f1013d72e02b0994730359e17801d395bdcbf20cfb3bb00a" }, +] + +[[package]] +name = "nvidia-curand-cu12" +version = "10.3.9.90" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/45/5e/92aa15eca622a388b80fbf8375d4760738df6285b1e92c43d37390a33a9a/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:dfab99248034673b779bc6decafdc3404a8a6f502462201f2f31f11354204acd" }, + { url = "https://mirrors.aliyun.com/pypi/packages/fb/aa/6584b56dc84ebe9cf93226a5cde4d99080c8e90ab40f0c27bda7a0f29aa1/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9" }, +] + +[[package]] +name = "nvidia-cusolver-cu12" +version = "11.7.3.90" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "nvidia-cublas-cu12" }, + { name = "nvidia-cusparse-cu12" }, + { name = "nvidia-nvjitlink-cu12" }, +] +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/c8/32/f7cd6ce8a7690544d084ea21c26e910a97e077c9b7f07bf5de623ee19981/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:db9ed69dbef9715071232caa9b69c52ac7de3a95773c2db65bdba85916e4e5c0" }, + { url = "https://mirrors.aliyun.com/pypi/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450" }, +] + +[[package]] +name = "nvidia-cusparse-cu12" +version = "12.5.8.93" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12" }, +] +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/bc/f7/cd777c4109681367721b00a106f491e0d0d15cfa1fd59672ce580ce42a97/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b6c161cb130be1a07a27ea6923df8141f3c295852f4b260c65f18f3e0a091dc" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b" }, +] + +[[package]] +name = "nvidia-cusparselt-cu12" +version = "0.7.1" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/73/b9/598f6ff36faaece4b3c50d26f50e38661499ff34346f00e057760b35cc9d/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8878dce784d0fac90131b6817b607e803c36e629ba34dc5b433471382196b6a5" }, + { url = "https://mirrors.aliyun.com/pypi/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623" }, +] + +[[package]] +name = "nvidia-nccl-cu12" +version = "2.28.9" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = 
"https://mirrors.aliyun.com/pypi/packages/08/c4/120d2dfd92dff2c776d68f361ff8705fdea2ca64e20b612fab0fd3f581ac/nvidia_nccl_cu12-2.28.9-py3-none-manylinux_2_18_aarch64.whl", hash = "sha256:50a36e01c4a090b9f9c47d92cec54964de6b9fcb3362d0e19b8ffc6323c21b60" }, + { url = "https://mirrors.aliyun.com/pypi/packages/4a/4e/44dbb46b3d1b0ec61afda8e84837870f2f9ace33c564317d59b70bc19d3e/nvidia_nccl_cu12-2.28.9-py3-none-manylinux_2_18_x86_64.whl", hash = "sha256:485776daa8447da5da39681af455aa3b2c2586ddcf4af8772495e7c532c7e5ab" }, +] + +[[package]] +name = "nvidia-nvjitlink-cu12" +version = "12.8.93" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/f6/74/86a07f1d0f42998ca31312f998bd3b9a7eff7f52378f4f270c8679c77fb9/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2a/a2/8cee5da30d13430e87bf99bb33455d2724d0a4a9cb5d7926d80ccb96d008/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:adccd7161ace7261e01bb91e44e88da350895c270d23f744f0820c818b7229e7" }, +] + +[[package]] +name = "nvidia-nvshmem-cu12" +version = "3.4.5" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/1d/6a/03aa43cc9bd3ad91553a88b5f6fb25ed6a3752ae86ce2180221962bc2aa5/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b48363fc6964dede448029434c6abed6c5e37f823cb43c3bcde7ecfc0457e15" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b5/09/6ea3ea725f82e1e76684f0708bbedd871fc96da89945adeba65c3835a64c/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:042f2500f24c021db8a06c5eec2539027d57460e1c1a762055a6554f72c369bd" }, +] + +[[package]] +name = "nvidia-nvtx-cu12" +version = "12.8.90" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/10/c0/1b303feea90d296f6176f32a2a70b5ef230f9bdeb3a72bddb0dc922dc137/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d7ad891da111ebafbf7e015d34879f7112832fc239ff0d7d776b6cb685274615" }, + { url = "https://mirrors.aliyun.com/pypi/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f" }, +] + +[[package]] +name = "packaging" +version = "26.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529" }, +] + +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = 
"https://mirrors.aliyun.com/pypi/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746" }, +] + +[[package]] +name = "pygments" +version = "2.20.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176" }, +] + +[[package]] +name = "pytest" +version = "9.0.3" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9" }, +] + +[[package]] +name = "setuptools" +version = "81.0.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/0d/1c/73e719955c59b8e424d015ab450f51c0af856ae46ea2da83eba51cc88de1/setuptools-81.0.0.tar.gz", hash = "sha256:487b53915f52501f0a79ccfd0c02c165ffe06631443a886740b91af4b7a5845a" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/e1/e3/c164c88b2e5ce7b24d667b9bd83589cf4f3520d97cad01534cd3c4f55fdb/setuptools-81.0.0-py3-none-any.whl", hash = "sha256:fdd925d5c5d9f62e4b74b30d6dd7828ce236fd6ed998a08d81de62ce5a6310d6" }, +] + +[[package]] +name = "sympy" +version = "1.14.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "mpmath" }, +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5" }, +] + +[[package]] +name = "tomli" +version = "2.4.1" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/22/de/48c59722572767841493b26183a0d1cc411d54fd759c5607c4590b6563a6/tomli-2.4.1.tar.gz", hash = 
"sha256:7c7e1a961a0b2f2472c1ac5b69affa0ae1132c39adcb67aba98568702b9cc23f" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/f4/11/db3d5885d8528263d8adc260bb2d28ebf1270b96e98f0e0268d32b8d9900/tomli-2.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f8f0fc26ec2cc2b965b7a3b87cd19c5c6b8c5e5f436b984e85f486d652285c30" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6d/f7/675db52c7e46064a9aa928885a9b20f4124ecb9bc2e1ce74c9106648d202/tomli-2.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4ab97e64ccda8756376892c53a72bd1f964e519c77236368527f758fbc36a53a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/61/71/81c50943cf953efa35bce7646caab3cf457a7d8c030b27cfb40d7235f9ee/tomli-2.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96481a5786729fd470164b47cdb3e0e58062a496f455ee41b4403be77cb5a076" }, + { url = "https://mirrors.aliyun.com/pypi/packages/48/c1/f41d9cb618acccca7df82aaf682f9b49013c9397212cb9f53219e3abac37/tomli-2.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a881ab208c0baf688221f8cecc5401bd291d67e38a1ac884d6736cbcd8247e9" }, + { url = "https://mirrors.aliyun.com/pypi/packages/22/e4/5a816ecdd1f8ca51fb756ef684b90f2780afc52fc67f987e3c61d800a46d/tomli-2.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47149d5bd38761ac8be13a84864bf0b7b70bc051806bc3669ab1cbc56216b23c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6b/49/2b2a0ef529aa6eec245d25f0c703e020a73955ad7edf73e7f54ddc608aa5/tomli-2.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ec9bfaf3ad2df51ace80688143a6a4ebc09a248f6ff781a9945e51937008fcbc" }, + { url = "https://mirrors.aliyun.com/pypi/packages/83/bd/6c1a630eaca337e1e78c5903104f831bda934c426f9231429396ce3c3467/tomli-2.4.1-cp311-cp311-win32.whl", hash = "sha256:ff2983983d34813c1aeb0fa89091e76c3a22889ee83ab27c5eeb45100560c049" }, + { url = "https://mirrors.aliyun.com/pypi/packages/42/59/71461df1a885647e10b6bb7802d0b8e66480c61f3f43079e0dcd315b3954/tomli-2.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:5ee18d9ebdb417e384b58fe414e8d6af9f4e7a0ae761519fb50f721de398dd4e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b8/83/dceca96142499c069475b790e7913b1044c1a4337e700751f48ed723f883/tomli-2.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:c2541745709bad0264b7d4705ad453b76ccd191e64aa6f0fc66b69a293a45ece" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c1/ba/42f134a3fe2b370f555f44b1d72feebb94debcab01676bf918d0cb70e9aa/tomli-2.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c742f741d58a28940ce01d58f0ab2ea3ced8b12402f162f4d534dfe18ba1cd6a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/dc/c7/62d7a17c26487ade21c5422b646110f2162f1fcc95980ef7f63e73c68f14/tomli-2.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7f86fd587c4ed9dd76f318225e7d9b29cfc5a9d43de44e5754db8d1128487085" }, + { url = "https://mirrors.aliyun.com/pypi/packages/5c/05/79d13d7c15f13bdef410bdd49a6485b1c37d28968314eabee452c22a7fda/tomli-2.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ff18e6a727ee0ab0388507b89d1bc6a22b138d1e2fa56d1ad494586d61d2eae9" }, + { url = "https://mirrors.aliyun.com/pypi/packages/10/90/d62ce007a1c80d0b2c93e02cab211224756240884751b94ca72df8a875ca/tomli-2.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:136443dbd7e1dee43c68ac2694fde36b2849865fa258d39bf822c10e8068eac5" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/1a/7e/caf6496d60152ad4ed09282c1885cca4eea150bfd007da84aea07bcc0a3e/tomli-2.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5e262d41726bc187e69af7825504c933b6794dc3fbd5945e41a79bb14c31f585" }, + { url = "https://mirrors.aliyun.com/pypi/packages/99/e7/c6f69c3120de34bbd882c6fba7975f3d7a746e9218e56ab46a1bc4b42552/tomli-2.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5cb41aa38891e073ee49d55fbc7839cfdb2bc0e600add13874d048c94aadddd1" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d6/2f/4a3c322f22c5c66c4b836ec58211641a4067364f5dcdd7b974b4c5da300c/tomli-2.4.1-cp312-cp312-win32.whl", hash = "sha256:da25dc3563bff5965356133435b757a795a17b17d01dbc0f42fb32447ddfd917" }, + { url = "https://mirrors.aliyun.com/pypi/packages/24/22/4daacd05391b92c55759d55eaee21e1dfaea86ce5c571f10083360adf534/tomli-2.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:52c8ef851d9a240f11a88c003eacb03c31fc1c9c4ec64a99a0f922b93874fda9" }, + { url = "https://mirrors.aliyun.com/pypi/packages/68/fd/70e768887666ddd9e9f5d85129e84910f2db2796f9096aa02b721a53098d/tomli-2.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:f758f1b9299d059cc3f6546ae2af89670cb1c4d48ea29c3cacc4fe7de3058257" }, + { url = "https://mirrors.aliyun.com/pypi/packages/07/06/b823a7e818c756d9a7123ba2cda7d07bc2dd32835648d1a7b7b7a05d848d/tomli-2.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:36d2bd2ad5fb9eaddba5226aa02c8ec3fa4f192631e347b3ed28186d43be6b54" }, + { url = "https://mirrors.aliyun.com/pypi/packages/14/6f/12645cf7f08e1a20c7eb8c297c6f11d31c1b50f316a7e7e1e1de6e2e7b7e/tomli-2.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:eb0dc4e38e6a1fd579e5d50369aa2e10acfc9cace504579b2faabb478e76941a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/5c/e0/90637574e5e7212c09099c67ad349b04ec4d6020324539297b634a0192b0/tomli-2.4.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7f2c7f2b9ca6bdeef8f0fa897f8e05085923eb091721675170254cbc5b02897" }, + { url = "https://mirrors.aliyun.com/pypi/packages/10/8f/d3ddb16c5a4befdf31a23307f72828686ab2096f068eaf56631e136c1fdd/tomli-2.4.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f3c6818a1a86dd6dca7ddcaaf76947d5ba31aecc28cb1b67009a5877c9a64f3f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e3/f1/dbeeb9116715abee2485bf0a12d07a8f31af94d71608c171c45f64c0469d/tomli-2.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d312ef37c91508b0ab2cee7da26ec0b3ed2f03ce12bd87a588d771ae15dcf82d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d3/74/16336ffd19ed4da28a70959f92f506233bd7cfc2332b20bdb01591e8b1d1/tomli-2.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51529d40e3ca50046d7606fa99ce3956a617f9b36380da3b7f0dd3dd28e68cb5" }, + { url = "https://mirrors.aliyun.com/pypi/packages/16/f9/229fa3434c590ddf6c0aa9af64d3af4b752540686cace29e6281e3458469/tomli-2.4.1-cp313-cp313-win32.whl", hash = "sha256:2190f2e9dd7508d2a90ded5ed369255980a1bcdd58e52f7fe24b8162bf9fedbd" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6a/1e/71dfd96bcc1c775420cb8befe7a9d35f2e5b1309798f009dca17b7708c1e/tomli-2.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:8d65a2fbf9d2f8352685bc1364177ee3923d6baf5e7f43ea4959d7d8bc326a36" }, + { url = "https://mirrors.aliyun.com/pypi/packages/83/7a/d34f422a021d62420b78f5c538e5b102f62bea616d1d75a13f0a88acb04a/tomli-2.4.1-cp313-cp313-win_arm64.whl", hash = 
"sha256:4b605484e43cdc43f0954ddae319fb75f04cc10dd80d830540060ee7cd0243cd" }, + { url = "https://mirrors.aliyun.com/pypi/packages/3c/fb/9a5c8d27dbab540869f7c1f8eb0abb3244189ce780ba9cd73f3770662072/tomli-2.4.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:fd0409a3653af6c147209d267a0e4243f0ae46b011aa978b1080359fddc9b6cf" }, + { url = "https://mirrors.aliyun.com/pypi/packages/62/05/d2f816630cc771ad836af54f5001f47a6f611d2d39535364f148b6a92d6b/tomli-2.4.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a120733b01c45e9a0c34aeef92bf0cf1d56cfe81ed9d47d562f9ed591a9828ac" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ce/48/66341bdb858ad9bd0ceab5a86f90eddab127cf8b046418009f2125630ecb/tomli-2.4.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:559db847dc486944896521f68d8190be1c9e719fced785720d2216fe7022b662" }, + { url = "https://mirrors.aliyun.com/pypi/packages/df/6d/c5fad00d82b3c7a3ab6189bd4b10e60466f22cfe8a08a9394185c8a8111c/tomli-2.4.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01f520d4f53ef97964a240a035ec2a869fe1a37dde002b57ebc4417a27ccd853" }, + { url = "https://mirrors.aliyun.com/pypi/packages/00/71/3a69e86f3eafe8c7a59d008d245888051005bd657760e96d5fbfb0b740c2/tomli-2.4.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7f94b27a62cfad8496c8d2513e1a222dd446f095fca8987fceef261225538a15" }, + { url = "https://mirrors.aliyun.com/pypi/packages/67/50/361e986652847fec4bd5e4a0208752fbe64689c603c7ae5ea7cb16b1c0ca/tomli-2.4.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ede3e6487c5ef5d28634ba3f31f989030ad6af71edfb0055cbbd14189ff240ba" }, + { url = "https://mirrors.aliyun.com/pypi/packages/8c/9a/b4173689a9203472e5467217e0154b00e260621caa227b6fa01feab16998/tomli-2.4.1-cp314-cp314-win32.whl", hash = "sha256:3d48a93ee1c9b79c04bb38772ee1b64dcf18ff43085896ea460ca8dec96f35f6" }, + { url = "https://mirrors.aliyun.com/pypi/packages/14/58/640ac93bf230cd27d002462c9af0d837779f8773bc03dee06b5835208214/tomli-2.4.1-cp314-cp314-win_amd64.whl", hash = "sha256:88dceee75c2c63af144e456745e10101eb67361050196b0b6af5d717254dddf7" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d5/2f/702d5e05b227401c1068f0d386d79a589bb12bf64c3d2c72ce0631e3bc49/tomli-2.4.1-cp314-cp314-win_arm64.whl", hash = "sha256:b8c198f8c1805dc42708689ed6864951fd2494f924149d3e4bce7710f8eb5232" }, + { url = "https://mirrors.aliyun.com/pypi/packages/45/4b/b877b05c8ba62927d9865dd980e34a755de541eb65fffba52b4cc495d4d2/tomli-2.4.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:d4d8fe59808a54658fcc0160ecfb1b30f9089906c50b23bcb4c69eddc19ec2b4" }, + { url = "https://mirrors.aliyun.com/pypi/packages/24/79/6ab420d37a270b89f7195dec5448f79400d9e9c1826df982f3f8e97b24fd/tomli-2.4.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7008df2e7655c495dd12d2a4ad038ff878d4ca4b81fccaf82b714e07eae4402c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/02/e0/3630057d8eb170310785723ed5adcdfb7d50cb7e6455f85ba8a3deed642b/tomli-2.4.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1d8591993e228b0c930c4bb0db464bdad97b3289fb981255d6c9a41aedc84b2d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7a/b4/1613716072e544d1a7891f548d8f9ec6ce2faf42ca65acae01d76ea06bb0/tomli-2.4.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:734e20b57ba95624ecf1841e72b53f6e186355e216e5412de414e3c51e5e3c41" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/05/38/30f541baf6a3f6df77b3df16b01ba319221389e2da59427e221ef417ac0c/tomli-2.4.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8a650c2dbafa08d42e51ba0b62740dae4ecb9338eefa093aa5c78ceb546fcd5c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/77/a3/ec9dd4fd2c38e98de34223b995a3b34813e6bdadf86c75314c928350ed14/tomli-2.4.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:504aa796fe0569bb43171066009ead363de03675276d2d121ac1a4572397870f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ef/be/605a6261cac79fba2ec0c9827e986e00323a1945700969b8ee0b30d85453/tomli-2.4.1-cp314-cp314t-win32.whl", hash = "sha256:b1d22e6e9387bf4739fbe23bfa80e93f6b0373a7f1b96c6227c32bef95a4d7a8" }, + { url = "https://mirrors.aliyun.com/pypi/packages/12/64/da524626d3b9cc40c168a13da8335fe1c51be12c0a63685cc6db7308daae/tomli-2.4.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2c1c351919aca02858f740c6d33adea0c5deea37f9ecca1cc1ef9e884a619d26" }, + { url = "https://mirrors.aliyun.com/pypi/packages/5a/cd/e80b62269fc78fc36c9af5a6b89c835baa8af28ff5ad28c7028d60860320/tomli-2.4.1-cp314-cp314t-win_arm64.whl", hash = "sha256:eab21f45c7f66c13f2a9e0e1535309cee140182a9cdae1e041d02e47291e8396" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7b/61/cceae43728b7de99d9b847560c262873a1f6c98202171fd5ed62640b494b/tomli-2.4.1-py3-none-any.whl", hash = "sha256:0d85819802132122da43cb86656f8d1f8c6587d54ae7dcaf30e90533028b49fe" }, +] + +[[package]] +name = "torch" +version = "2.11.0+cu128" +source = { registry = "https://download.pytorch.org/whl/cu128" } +dependencies = [ + { name = "cuda-bindings", marker = "sys_platform == 'linux'" }, + { name = "cuda-toolkit", extra = ["cublas", "cudart", "cufft", "cufile", "cupti", "curand", "cusolver", "cusparse", "nvjitlink", "nvrtc", "nvtx"], marker = "sys_platform == 'linux'" }, + { name = "filelock" }, + { name = "fsspec" }, + { name = "jinja2" }, + { name = "networkx", version = "3.4.2", source = { registry = "https://mirrors.aliyun.com/pypi/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "networkx", version = "3.6.1", source = { registry = "https://mirrors.aliyun.com/pypi/simple/" }, marker = "python_full_version >= '3.11'" }, + { name = "nvidia-cudnn-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvshmem-cu12", marker = "sys_platform == 'linux'" }, + { name = "setuptools" }, + { name = "sympy" }, + { name = "triton", marker = "sys_platform == 'linux'" }, + { name = "typing-extensions" }, +] +wheels = [ + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp310-cp310-manylinux_2_28_aarch64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp310-cp310-win_amd64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp311-cp311-manylinux_2_28_aarch64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp311-cp311-win_amd64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl" }, + { url = 
"https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp312-cp312-win_amd64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp313-cp313-win_amd64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp313-cp313t-manylinux_2_28_aarch64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp313-cp313t-win_amd64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp314-cp314-manylinux_2_28_aarch64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp314-cp314-manylinux_2_28_x86_64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp314-cp314-win_amd64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp314-cp314t-manylinux_2_28_aarch64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp314-cp314t-manylinux_2_28_x86_64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp314-cp314t-win_amd64.whl" }, +] + +[[package]] +name = "triton" +version = "3.6.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/44/ba/b1b04f4b291a3205d95ebd24465de0e5bf010a2df27a4e58a9b5f039d8f2/triton-3.6.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c723cfb12f6842a0ae94ac307dba7e7a44741d720a40cf0e270ed4a4e3be781" }, + { url = "https://mirrors.aliyun.com/pypi/packages/8c/f7/f1c9d3424ab199ac53c2da567b859bcddbb9c9e7154805119f8bd95ec36f/triton-3.6.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6550fae429e0667e397e5de64b332d1e5695b73650ee75a6146e2e902770bea" }, + { url = "https://mirrors.aliyun.com/pypi/packages/0f/2c/96f92f3c60387e14cc45aed49487f3486f89ea27106c1b1376913c62abe4/triton-3.6.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49df5ef37379c0c2b5c0012286f80174fcf0e073e5ade1ca9a86c36814553651" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e0/12/b05ba554d2c623bffa59922b94b0775673de251f468a9609bc9e45de95e9/triton-3.6.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8e323d608e3a9bfcc2d9efcc90ceefb764a82b99dea12a86d643c72539ad5d3" }, + { url = "https://mirrors.aliyun.com/pypi/packages/17/5d/08201db32823bdf77a0e2b9039540080b2e5c23a20706ddba942924ebcd6/triton-3.6.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:374f52c11a711fd062b4bfbb201fd9ac0a5febd28a96fb41b4a0f51dde3157f4" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ab/a8/cdf8b3e4c98132f965f88c2313a4b493266832ad47fb52f23d14d4f86bb5/triton-3.6.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74caf5e34b66d9f3a429af689c1c7128daba1d8208df60e81106b115c00d6fca" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/3c/12/34d71b350e89a204c2c7777a9bba0dcf2f19a5bfdd70b57c4dbc5ffd7154/triton-3.6.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:448e02fe6dc898e9e5aa89cf0ee5c371e99df5aa5e8ad976a80b93334f3494fd" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f9/0b/37d991d8c130ce81a8728ae3c25b6e60935838e9be1b58791f5997b24a54/triton-3.6.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10c7f76c6e72d2ef08df639e3d0d30729112f47a56b0c81672edc05ee5116ac9" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ce/4e/41b0c8033b503fd3cfcd12392cdd256945026a91ff02452bef40ec34bee7/triton-3.6.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1722e172d34e32abc3eb7711d0025bb69d7959ebea84e3b7f7a341cd7ed694d6" }, + { url = "https://mirrors.aliyun.com/pypi/packages/35/f8/9c66bfc55361ec6d0e4040a0337fb5924ceb23de4648b8a81ae9d33b2b38/triton-3.6.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d002e07d7180fd65e622134fbd980c9a3d4211fb85224b56a0a0efbd422ab72f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/49/55/5ecf0dcaa0f2fbbd4420f7ef227ee3cb172e91e5fede9d0ecaddc43363b4/triton-3.6.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ef5523241e7d1abca00f1d240949eebdd7c673b005edbbce0aca95b8191f1d43" }, + { url = "https://mirrors.aliyun.com/pypi/packages/df/3d/9e7eee57b37c80cec63322c0231bb6da3cfe535a91d7a4d64896fcb89357/triton-3.6.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a17a5d5985f0ac494ed8a8e54568f092f7057ef60e1b0fa09d3fd1512064e803" }, + { url = "https://mirrors.aliyun.com/pypi/packages/48/db/56ee649cab5eaff4757541325aca81f52d02d4a7cd3506776cad2451e060/triton-3.6.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b3a97e8ed304dfa9bd23bb41ca04cdf6b2e617d5e782a8653d616037a5d537d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f6/56/6113c23ff46c00aae423333eb58b3e60bdfe9179d542781955a5e1514cb3/triton-3.6.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:46bd1c1af4b6704e554cad2eeb3b0a6513a980d470ccfa63189737340c7746a7" }, +] + +[[package]] +name = "typing-extensions" +version = "4.15.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548" }, +]