"""Triton vector-add lab: a masked elementwise-add kernel and its launcher."""

from __future__ import annotations

import torch

try:
    import triton
    import triton.language as tl
except ImportError:  # pragma: no cover - depends on local environment
    triton = None
    tl = None

# True when the optional Triton dependency could be imported.
TRITON_AVAILABLE = triton is not None


if TRITON_AVAILABLE:

    @triton.jit
    def vector_add_kernel(
        x_ptr,
        y_ptr,
        out_ptr,
        num_elements,
        block_size: tl.constexpr,
    ):
        """Add one ``block_size``-wide slice of ``x`` and ``y`` into ``out``.

        Each program instance handles the offsets
        ``[pid * block_size, (pid + 1) * block_size)`` of the flat buffers.
        """
        pid = tl.program_id(axis=0)
        offsets = pid * block_size + tl.arange(0, block_size)
        # The last program may run past the end of the buffers; the mask
        # disables every lane whose offset is >= num_elements.
        mask = offsets < num_elements

        x = tl.load(x_ptr + offsets, mask=mask)
        y = tl.load(y_ptr + offsets, mask=mask)
        tl.store(out_ptr + offsets, x + y, mask=mask)


def triton_vector_add(x: torch.Tensor, y: torch.Tensor, block_size: int = 1024) -> torch.Tensor:
    """Return the elementwise sum ``x + y`` computed by ``vector_add_kernel``.

    Args:
        x: First operand; must be a CUDA tensor.
        y: Second operand; must be a CUDA tensor with the same shape and
            device as ``x``.
        block_size: Number of elements handled by each program instance.
            Must be a positive power of two (a ``tl.arange`` requirement).

    Returns:
        A new contiguous tensor with the shape of ``x`` on the device of
        ``x``; its dtype is ``torch.result_type(x, y)``.

    Raises:
        RuntimeError: If Triton is not installed.
        ValueError: If the shapes differ, either tensor is not on CUDA, the
            tensors live on different devices, or ``block_size`` is not a
            positive power of two.
    """
    if not TRITON_AVAILABLE:
        raise RuntimeError("Triton is not installed in this environment.")
    if x.shape != y.shape:
        raise ValueError(f"shape mismatch: {x.shape} vs {y.shape}")
    if not x.is_cuda or not y.is_cuda:
        raise ValueError("Triton kernels in this lab expect CUDA tensors.")
    if x.device != y.device:
        raise ValueError(f"device mismatch: {x.device} vs {y.device}")
    if block_size < 1 or block_size & (block_size - 1):
        raise ValueError(f"block_size must be a positive power of two, got {block_size}")

    # The kernel indexes flat memory, so strided inputs must be densified
    # first; .contiguous() is a no-op for tensors that already are.
    x_flat = x.contiguous()
    y_flat = y.contiguous()
    out = torch.empty(x.shape, dtype=torch.result_type(x, y), device=x.device)

    num_elements = out.numel()
    if num_elements == 0:
        # Nothing to compute; avoid launching a zero-sized grid.
        return out

    # One program per block_size-wide slice, rounding up so the tail is covered.
    grid = (triton.cdiv(num_elements, block_size),)
    # Launch on the tensors' device even when it is not the current one.
    with torch.cuda.device(x.device):
        vector_add_kernel[grid](x_flat, y_flat, out, num_elements, block_size=block_size)
    return out