Confirms that snapshot_link works for CUDA device pointers, not just host memory. The sender on cuda:0 pushes to the receiver on cuda:1 via RDMA over mlx5_60. All 5 sizes (16K, 1M, 16M, 64M, 256M) pass SHA-256 verification.

  16 KB    8.3 ms    0.016 Gbps  (cold openSegment)
  1 MB     0.10 ms   87.6 Gbps
  16 MB    0.84 ms   159 Gbps
  64 MB    2.52 ms   213 Gbps
  256 MB   8.54 ms   251 Gbps    (~60% of NDR400 line rate)

For Inferact-scale sessions (~50K tokens × ~80 KB layer-per-token = ~4 GB), this projects a D→P transfer time of ~130 ms — within the "reseed-savings" envelope sketched in design doc §3.2.

Files:
  scripts/snapshot_link_receiver_gpu.py
  scripts/smoke_snapshot_link_gpu.py

Next: SGLang scheduler integration for D-side dump + P-side ingest.
125 lines
4.1 KiB
Python
125 lines
4.1 KiB
Python
#!/usr/bin/env python3
|
|
"""GPU-side receiver child for snapshot_link smoke test (CUDA mem)."""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import hashlib
|
|
import json
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
# Put "<this script's parent directory>/../src" at the front of sys.path so the
# script can be run straight from a checkout.
# NOTE(review): presumably this is where the `agentic_pd_hybrid` package lives — confirm.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))
|
|
|
|
|
|
def _emit(d: dict) -> None:
    """Write *d* to stdout as a single JSON line and flush immediately.

    Each call produces exactly one line, so a process reading this script's
    stdout can decode events line by line as they happen.
    """
    line = json.dumps(d)
    print(line, flush=True)
|
|
|
|
|
|
def main():
    """Run the GPU-side receiver of the snapshot_link smoke test.

    Steps:
      1. Allocate a ``--max-bytes`` uint8 tensor on ``cuda:<--gpu-id>``.
      2. Create a ``SnapshotPeer`` with no internal receive buffer and
         register the tensor's device pointer with ``peer.engine``.
      3. Publish the endpoint (session id, base pointer, capacity, gpu id) as
         JSON to ``--control-path`` and emit an ``endpoint-ready`` event.
      4. For every size in ``--sizes`` (sizes larger than ``--max-bytes`` are
         skipped): wait for the signal file, read the expected SHA-256 from
         its ``"sha"`` key, hash the first ``size`` bytes of the GPU buffer,
         emit a ``verify`` event and write the ack file.

    Handshake files are derived from ``--control-path`` by replacing its
    suffix: ``.do<size>`` is waited for, ``.ack<size>`` is written.

    All progress is reported through ``_emit`` as one JSON object per line.

    Exit status is 2 for a malformed ``--sizes`` value (argparse usage error)
    and for any initialisation failure (after an ``init-failed`` event).
    """
    import traceback

    signal_timeout_s = 120.0   # how long to wait for each size's signal file
    poll_interval_s = 0.05     # sleep between polls / parse retries
    parse_attempts = 5         # tolerate a signal file that is still being written

    ap = argparse.ArgumentParser()
    ap.add_argument("--host", required=True)
    ap.add_argument("--port", type=int, required=True)
    ap.add_argument("--ib", required=True)
    ap.add_argument("--max-bytes", type=int, required=True)
    ap.add_argument("--control-path", required=True)
    ap.add_argument("--sizes", required=True)
    ap.add_argument("--gpu-id", type=int, default=1, help="receiver GPU id")
    args = ap.parse_args()

    # Report a malformed --sizes as a usage error instead of an uncaught
    # ValueError traceback; ap.error() prints the message and exits with 2.
    try:
        sizes = [int(s) for s in args.sizes.split(",")]
    except ValueError:
        ap.error(f"--sizes must be a comma-separated list of integers, got {args.sizes!r}")
    if any(s < 0 for s in sizes):
        # A negative size would slice the buffer from the end and hash the
        # wrong region.
        ap.error(f"--sizes must not contain negative values, got {args.sizes!r}")

    try:
        import torch

        if not torch.cuda.is_available():
            _emit({"event": "init-failed", "error": "cuda not available"})
            sys.exit(2)
        torch.cuda.set_device(args.gpu_id)
        # allocate a GPU buffer of max_bytes
        recv_tensor = torch.zeros(args.max_bytes, dtype=torch.uint8, device=f"cuda:{args.gpu_id}")
        recv_ptr = recv_tensor.data_ptr()
    except Exception as e:
        _emit({"event": "init-failed", "error": repr(e), "tb": traceback.format_exc()})
        sys.exit(2)

    # Spin up SnapshotPeer with NO internal recv buffer, then register our GPU tensor
    try:
        # Imported inside the try so that a missing or broken package is
        # reported as "init-failed" like every other start-up problem.
        from agentic_pd_hybrid.snapshot_link import SnapshotPeer, SnapshotEndpoint

        peer = SnapshotPeer(
            host=args.host,
            port=args.port,
            ib_device=args.ib,
            receive_capacity_bytes=0,
        )
    except Exception as e:
        _emit({"event": "init-failed", "error": repr(e), "tb": traceback.format_exc()})
        sys.exit(2)

    # From here on the peer exists, so every exit path (including sys.exit and
    # unexpected exceptions) must release it.
    try:
        try:
            ret = peer.engine.register_memory(recv_ptr, args.max_bytes)
        except Exception as e:
            _emit({"event": "init-failed", "error": repr(e), "tb": traceback.format_exc()})
            sys.exit(2)
        if ret != 0:
            _emit({"event": "init-failed", "error": f"register_memory({hex(recv_ptr)}, {args.max_bytes}) ret={ret}"})
            sys.exit(2)

        endpoint = SnapshotEndpoint(
            session_id=peer.session_id,
            base_ptr=recv_ptr,
            capacity_bytes=args.max_bytes,
        )

        cp = Path(args.control_path)
        # Write to a sibling temp file and rename it into place so that a
        # reader polling the control path never sees partially written JSON.
        tmp_cp = cp.with_name(cp.name + ".tmp")
        tmp_cp.write_text(json.dumps({
            "session_id": endpoint.session_id,
            "base_ptr": endpoint.base_ptr,
            "capacity_bytes": endpoint.capacity_bytes,
            "gpu_id": args.gpu_id,
            "ready": True,
        }))
        tmp_cp.replace(cp)
        _emit({"event": "endpoint-ready",
               "session_id": endpoint.session_id,
               "base_ptr": endpoint.base_ptr,
               "capacity": endpoint.capacity_bytes,
               "gpu_id": args.gpu_id})

        for size in sizes:
            if size > args.max_bytes:
                # Does not fit in the registered buffer; skipped silently.
                # NOTE(review): presumably the sender skips these too — confirm.
                continue
            signal_path = cp.with_suffix(f".do{size}")
            ack_path = cp.with_suffix(f".ack{size}")

            # Monotonic clock: the timeout must not be affected by wall-clock
            # adjustments while we wait.
            deadline = time.monotonic() + signal_timeout_s
            while time.monotonic() < deadline:
                if signal_path.exists():
                    break
                time.sleep(poll_interval_s)
            else:
                _emit({"event": "no-signal-timeout", "size": size})
                continue

            # The file may become visible before its content is fully written,
            # so retry the read/parse a few times before giving up.
            last_err = None
            for _ in range(parse_attempts):
                try:
                    payload = json.loads(signal_path.read_text())
                    expected_sha = payload["sha"]
                except Exception as e:
                    last_err = e
                    time.sleep(poll_interval_s)
                else:
                    break
            else:
                _emit({"event": "signal-parse-error", "size": size, "err": repr(last_err)})
                continue

            # Copy from GPU to CPU and hash
            torch.cuda.synchronize(args.gpu_id)
            host_bytes = recv_tensor[:size].cpu().numpy().tobytes()
            recv_sha = hashlib.sha256(host_bytes).hexdigest()
            ok = recv_sha == expected_sha
            _emit({
                "event": "verify",
                "size": size,
                "ok": ok,
                "expected_sha": expected_sha[:16],
                "got_sha": recv_sha[:16],
                "first8_recv": host_bytes[:8].hex(),
                "last8_recv": host_bytes[-8:].hex(),
            })
            ack_path.write_text("done")
    finally:
        peer.close()

    # Only reached on normal completion, after the peer has been closed.
    _emit({"event": "receiver-done"})
|
|
|
|
|
|
# Run the receiver only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
|