# kernel-lab/tasks/02_row_softmax/triton_skeleton.py

"""Workbook-local Triton notes for row softmax."""


def notes() -> str:
    """Return the student worksheet text for the row-softmax exercise.

    The text is a numbered TODO list followed by reflection questions.
    It begins and ends with a newline character.
    """
    worksheet_lines = (
        "",  # leading empty entry: the text starts with a newline
        "TODO(student):",
        "1. Decide what one program instance owns: a whole row or a row tile.",
        "2. Load a row with masking.",
        "3. Compute row_max = max(x).",
        "4. Compute exp(x - row_max), then the row sum.",
        "5. Normalize and store.",
        "",
        "Reflection:",
        "- Why does numerical stability matter here more than in vector add?",
        "- Where does extra memory traffic appear in a naive multi-kernel approach?",
        "",  # trailing empty entry: the text ends with a newline
    )
    return "\n".join(worksheet_lines)


if __name__ == "__main__":
    # Script entry point: show the worksheet on stdout when run directly.
    worksheet = notes()
    print(worksheet)