Files
agentic-pd-hybrid/scripts/snapshot_link_receiver_gpu.py
Claude Code Agent 7216507773 feat(snapshot): D→P RDMA Phase 1b — GPU pointer path verified
Confirms snapshot_link works for cuda device pointers, not just host
memory. Sender on cuda:0 pushes to receiver on cuda:1 via RDMA over
mlx5_60. All 5 sizes (16K, 1M, 16M, 64M, 256M) pass SHA verification.

  16 KB     8.3 ms   0.016 Gbps  (cold openSegment)
  1 MB      0.10 ms  87.6 Gbps
  16 MB     0.84 ms  159 Gbps
  64 MB     2.52 ms  213 Gbps
  256 MB    8.54 ms  251 Gbps    (~60% NDR400 line rate)

For Inferact-scale sessions (~50K tokens × ~80 KB layer-per-token =
~4 GB), this projects D→P transfer time at ~130 ms (4 GB at the ~251 Gbps
measured for 256 MB transfers) — within the "reseed-savings" envelope
sketched in design doc §3.2.

Files:
  scripts/snapshot_link_receiver_gpu.py
  scripts/smoke_snapshot_link_gpu.py

Next: SGLang scheduler integration for D-side dump + P-side ingest.
2026-05-13 00:59:43 +08:00

125 lines
4.1 KiB
Python

#!/usr/bin/env python3
"""GPU-side receiver child for snapshot_link smoke test (CUDA mem)."""
from __future__ import annotations
import argparse
import hashlib
import json
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))
def _emit(d: dict) -> None:
    """Print *d* to stdout as one JSON-encoded line, flushing immediately."""
    line = json.dumps(d)
    # Flush on every event so a process reading stdout sees it right away.
    print(line, flush=True)
def main():
    """Run the GPU-side receiver for the snapshot_link smoke test.

    Steps, in order:

    1. Parse the CLI arguments and allocate a ``--max-bytes`` uint8 tensor on
       ``cuda:<--gpu-id>``.
    2. Create a ``SnapshotPeer`` with ``receive_capacity_bytes=0`` and register
       the tensor's device pointer via ``peer.engine.register_memory``.
    3. Publish the endpoint (session id, base pointer, capacity, gpu id) as
       JSON to ``--control-path`` and as an ``endpoint-ready`` stdout event.
    4. For every size in ``--sizes`` (comma-separated ints) that fits in the
       buffer: wait up to 120 s for the ``.do<size>`` signal file, read its
       ``"sha"`` field, SHA-256 the first ``size`` bytes of the GPU buffer,
       emit a ``verify`` event and write the ``.ack<size>`` file.

    Progress is reported as one JSON object per stdout line through ``_emit``.
    Any initialisation failure emits ``init-failed`` and exits with status 2.
    The peer, once created, is always closed before the function returns or
    the process exits.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--host", required=True)
    ap.add_argument("--port", type=int, required=True)
    ap.add_argument("--ib", required=True)
    ap.add_argument("--max-bytes", type=int, required=True)
    ap.add_argument("--control-path", required=True)
    ap.add_argument("--sizes", required=True)
    ap.add_argument("--gpu-id", type=int, default=1, help="receiver GPU id")
    args = ap.parse_args()
    # Skip empty tokens so a trailing comma in --sizes does not crash int().
    sizes = [int(tok) for tok in args.sizes.split(",") if tok.strip()]

    poll_interval = 0.05   # seconds between filesystem polls
    signal_timeout = 120.0  # seconds to wait for each signal file
    parse_attempts = 5      # re-reads of a signal file that fails to parse

    try:
        import torch
        if not torch.cuda.is_available():
            _emit({"event": "init-failed", "error": "cuda not available"})
            sys.exit(2)
        torch.cuda.set_device(args.gpu_id)
        # Allocate a GPU buffer of max_bytes; its device pointer is what gets
        # registered with the transfer engine below.
        recv_tensor = torch.zeros(args.max_bytes, dtype=torch.uint8, device=f"cuda:{args.gpu_id}")
        recv_ptr = recv_tensor.data_ptr()
    except Exception as e:
        import traceback
        _emit({"event": "init-failed", "error": repr(e), "tb": traceback.format_exc()})
        sys.exit(2)

    peer = None
    try:
        # Spin up SnapshotPeer with NO internal recv buffer, then register our
        # GPU tensor. The project import lives inside the guarded region so an
        # import failure is reported as "init-failed" like every other
        # initialisation error.
        try:
            from agentic_pd_hybrid.snapshot_link import SnapshotPeer, SnapshotEndpoint
            peer = SnapshotPeer(
                host=args.host,
                port=args.port,
                ib_device=args.ib,
                receive_capacity_bytes=0,
            )
            ret = peer.engine.register_memory(recv_ptr, args.max_bytes)
            if ret != 0:
                _emit({"event": "init-failed", "error": f"register_memory({hex(recv_ptr)}, {args.max_bytes}) ret={ret}"})
                sys.exit(2)
        except Exception as e:
            import traceback
            _emit({"event": "init-failed", "error": repr(e), "tb": traceback.format_exc()})
            sys.exit(2)

        endpoint = SnapshotEndpoint(
            session_id=peer.session_id,
            base_ptr=recv_ptr,
            capacity_bytes=args.max_bytes,
        )

        cp = Path(args.control_path)
        # Write to a sibling temp file and rename it into place so a process
        # polling the control path never reads a half-written JSON document.
        tmp_path = cp.with_name(cp.name + ".tmp")
        tmp_path.write_text(json.dumps({
            "session_id": endpoint.session_id,
            "base_ptr": endpoint.base_ptr,
            "capacity_bytes": endpoint.capacity_bytes,
            "gpu_id": args.gpu_id,
            "ready": True,
        }))
        tmp_path.replace(cp)
        _emit({"event": "endpoint-ready",
               "session_id": endpoint.session_id,
               "base_ptr": endpoint.base_ptr,
               "capacity": endpoint.capacity_bytes,
               "gpu_id": args.gpu_id})

        for size in sizes:
            if size > args.max_bytes:
                # Does not fit in the registered buffer; skipped without an
                # event or ack.
                continue
            # NOTE: with_suffix() replaces any existing suffix of the control
            # path rather than appending to it.
            signal_path = cp.with_suffix(f".do{size}")
            ack_path = cp.with_suffix(f".ack{size}")

            # Monotonic clock: the timeout must not be affected by wall-clock
            # adjustments.
            deadline = time.monotonic() + signal_timeout
            while time.monotonic() < deadline:
                if signal_path.exists():
                    break
                time.sleep(poll_interval)
            else:
                _emit({"event": "no-signal-timeout", "size": size})
                continue

            # The signal file may exist before its writer has finished, so a
            # failed parse is retried a few times before being reported.
            last_err = None
            for _ in range(parse_attempts):
                try:
                    payload = json.loads(signal_path.read_text())
                    expected_sha = payload["sha"]
                except Exception as e:
                    last_err = e
                    time.sleep(poll_interval)
                else:
                    break
            else:
                _emit({"event": "signal-parse-error", "size": size, "err": repr(last_err)})
                continue

            # Copy from GPU to CPU and hash
            torch.cuda.synchronize(args.gpu_id)
            host_bytes = recv_tensor[:size].cpu().numpy().tobytes()
            recv_sha = hashlib.sha256(host_bytes).hexdigest()
            ok = recv_sha == expected_sha
            _emit({
                "event": "verify",
                "size": size,
                "ok": ok,
                "expected_sha": expected_sha[:16],
                "got_sha": recv_sha[:16],
                "first8_recv": host_bytes[:8].hex(),
                "last8_recv": host_bytes[-8:].hex(),
            })
            ack_path.write_text("done")
    finally:
        # Close the peer on every exit path (normal completion, sys.exit on
        # init failure, or an unexpected exception in the verify loop).
        if peer is not None:
            peer.close()
    _emit({"event": "receiver-done"})
# Script entry point: start the receiver only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()