#!/usr/bin/env python3
"""GPU-side receiver child for snapshot_link smoke test (CUDA mem).

The process allocates a ``uint8`` CUDA tensor of ``--max-bytes`` bytes,
creates a ``SnapshotPeer`` with no internal receive buffer, and registers the
tensor's device pointer with the peer's engine.  It then publishes the
endpoint description as JSON at ``--control-path`` and, for every size in
``--sizes``:

1. waits for ``<control-path with suffix .do<size>>`` to appear (presumably
   written by the sending side once its transfer is complete),
2. reads the expected SHA-256 from the ``"sha"`` key of that JSON file,
3. copies the first ``size`` bytes of the GPU buffer to the host, hashes
   them, and emits a ``verify`` event,
4. writes ``<control-path with suffix .ack<size>>``.

Progress is reported as one JSON object per line on stdout.
"""
from __future__ import annotations

import argparse
import hashlib
import json
import sys
import time
import traceback
from pathlib import Path
from typing import NoReturn

sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))

# How long to wait for each per-size signal file, and how often to poll for it.
SIGNAL_TIMEOUT_S = 120.0
POLL_INTERVAL_S = 0.05


def _emit(d: dict) -> None:
    """Print *d* as a single JSON line on stdout and flush immediately."""
    print(json.dumps(d), flush=True)


def _init_failed(error: str, tb: str | None = None) -> NoReturn:
    """Emit an ``init-failed`` event and terminate with exit status 2.

    Args:
        error: Short description of the failure.
        tb: Optional formatted traceback; the ``"tb"`` key is only included
            in the event when this is given.
    """
    event = {"event": "init-failed", "error": error}
    if tb is not None:
        event["tb"] = tb
    _emit(event)
    sys.exit(2)


def _parse_sizes(spec: str) -> list[int]:
    """Parse a comma-separated list of byte counts.

    Blank entries (for example from a trailing comma) are ignored.

    Raises:
        ValueError: If an entry is not an integer or is negative.
    """
    sizes = [int(token) for token in spec.split(",") if token.strip()]
    for size in sizes:
        if size < 0:
            # A negative size would silently turn ``tensor[:size]`` into a
            # "drop the tail" slice and hash the wrong bytes.
            raise ValueError(f"size must be non-negative, got {size}")
    return sizes


def _write_text_atomic(path: Path, text: str) -> None:
    """Write *text* to *path* so that readers never observe a partial file.

    The content is written to a sibling temporary file which is then renamed
    over *path*; a process polling for *path* therefore sees either nothing
    or the complete content.
    """
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(text)
    tmp_path.replace(path)


def _wait_for_file(path: Path, timeout_s: float,
                   poll_s: float = POLL_INTERVAL_S) -> bool:
    """Poll until *path* exists or *timeout_s* seconds have elapsed.

    Existence is always checked at least once, even when *timeout_s* is 0.
    The deadline is measured with the monotonic clock so that wall-clock
    adjustments cannot shorten or extend the wait.

    Returns:
        ``True`` if the file appeared, ``False`` on timeout.
    """
    deadline = time.monotonic() + timeout_s
    while True:
        if path.exists():
            return True
        if time.monotonic() >= deadline:
            return False
        time.sleep(poll_s)


def main(argv: list[str] | None = None) -> None:
    """Run the receiver: set up the GPU buffer and peer, then verify sizes.

    Args:
        argv: Command-line arguments to parse; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Exits with status 2 (after emitting an ``init-failed`` event) when CUDA,
    the peer, or memory registration cannot be set up.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--host", required=True)
    ap.add_argument("--port", type=int, required=True)
    ap.add_argument("--ib", required=True)
    ap.add_argument("--max-bytes", type=int, required=True)
    ap.add_argument("--control-path", required=True)
    ap.add_argument("--sizes", required=True)
    ap.add_argument("--gpu-id", type=int, default=1, help="receiver GPU id")
    args = ap.parse_args(argv)

    try:
        sizes = _parse_sizes(args.sizes)
    except ValueError as exc:
        ap.error(f"invalid --sizes {args.sizes!r}: {exc}")

    # --- GPU buffer -------------------------------------------------------
    # ``_init_failed`` raises SystemExit, which ``except Exception`` does not
    # intercept, so it is safe to call from inside the guarded section.
    try:
        import torch

        if not torch.cuda.is_available():
            _init_failed("cuda not available")
        torch.cuda.set_device(args.gpu_id)
        # Allocate a GPU buffer of max_bytes.
        recv_tensor = torch.zeros(
            args.max_bytes, dtype=torch.uint8, device=f"cuda:{args.gpu_id}"
        )
        recv_ptr = recv_tensor.data_ptr()
    except Exception as exc:
        _init_failed(repr(exc), traceback.format_exc())

    # --- Peer -------------------------------------------------------------
    # Spin up SnapshotPeer with NO internal recv buffer; our GPU tensor is
    # registered below instead.  The project import is inside the guarded
    # section so an import problem is reported as ``init-failed`` too.
    try:
        from agentic_pd_hybrid.snapshot_link import SnapshotEndpoint, SnapshotPeer

        peer = SnapshotPeer(
            host=args.host,
            port=args.port,
            ib_device=args.ib,
            receive_capacity_bytes=0,
        )
    except Exception as exc:
        _init_failed(repr(exc), traceback.format_exc())

    # From here on the peer exists, so every exit path must close it.
    try:
        try:
            ret = peer.engine.register_memory(recv_ptr, args.max_bytes)
        except Exception as exc:
            _init_failed(repr(exc), traceback.format_exc())
        if ret != 0:
            _init_failed(
                f"register_memory({hex(recv_ptr)}, {args.max_bytes}) ret={ret}"
            )

        endpoint = SnapshotEndpoint(
            session_id=peer.session_id,
            base_ptr=recv_ptr,
            capacity_bytes=args.max_bytes,
        )

        control_path = Path(args.control_path)
        _write_text_atomic(control_path, json.dumps({
            "session_id": endpoint.session_id,
            "base_ptr": endpoint.base_ptr,
            "capacity_bytes": endpoint.capacity_bytes,
            "gpu_id": args.gpu_id,
            "ready": True,
        }))
        _emit({
            "event": "endpoint-ready",
            "session_id": endpoint.session_id,
            "base_ptr": endpoint.base_ptr,
            "capacity": endpoint.capacity_bytes,
            "gpu_id": args.gpu_id,
        })

        for size in sizes:
            if size > args.max_bytes:
                # Does not fit in the registered buffer; nothing to verify.
                continue

            signal_path = control_path.with_suffix(f".do{size}")
            ack_path = control_path.with_suffix(f".ack{size}")

            if not _wait_for_file(signal_path, SIGNAL_TIMEOUT_S):
                _emit({"event": "no-signal-timeout", "size": size})
                continue

            try:
                payload = json.loads(signal_path.read_text())
                expected_sha = payload["sha"]
            except Exception as exc:
                _emit({"event": "signal-parse-error", "size": size,
                       "err": repr(exc)})
                continue

            # Copy from GPU to CPU and hash.
            torch.cuda.synchronize(args.gpu_id)
            host_bytes = recv_tensor[:size].cpu().numpy().tobytes()
            recv_sha = hashlib.sha256(host_bytes).hexdigest()
            _emit({
                "event": "verify",
                "size": size,
                "ok": recv_sha == expected_sha,
                "expected_sha": expected_sha[:16],
                "got_sha": recv_sha[:16],
                "first8_recv": host_bytes[:8].hex(),
                "last8_recv": host_bytes[-8:].hex(),
            })
            ack_path.write_text("done")
    finally:
        peer.close()

    _emit({"event": "receiver-done"})


if __name__ == "__main__":
    main()