agentic-kvc/analysis/characterization/current_results/claim_matrix.json

[
  {
    "claim": "Batch 0 substrate audit is only partially complete for existing runs.",
    "needed_next": "Add request dispatch and finish/error timestamps to future replayer/proxy metrics.",
    "reviewer_risk": "Cannot use these runs to prove online per-session sequentiality.",
    "status": "partially_supported",
    "supporting_data": "metrics.jsonl lacks actual dispatch/finish timestamps in current artifacts."
  },
  {
    "claim": "Batch 1 workload shape can be characterized from formatted traces and metrics.",
    "needed_next": "Add cache-hit joined records for actual reuse decomposition.",
    "reviewer_risk": "Actual cache reuse decomposition needs cached_tokens joined with hash_ids.",
    "status": "supported_for_trace_shape",
    "supporting_data": "Full compact trace CPU summary in full_trace_summary.json: input p50/p90/p99 = 20k/87.9k/125.5k, output p50/p90/p99 = 80/811/6.6k, top 1% sessions hold 46.5% of input-token mass."
  },
  {
    "claim": "Static PD separation is worse than combined in existing 200-request GPU A/B.",
    "needed_next": "Refresh with PD matrix, multiple seeds, cudagraph-enabled methodology.",
    "reviewer_risk": "Legacy run has no per-stage TTFT breakdown and no step-level KV occupancy.",
    "status": "supported_by_existing_artifact",
    "supporting_data": "outputs/gpu_ab_combined vs outputs/gpu_ab_pdsep metrics.summary.json."
  },
  {
    "claim": "Elastic transfer-based migration does not improve high-contention 500-request run.",
    "needed_next": "Attribute whether failure is trigger quality, transfer overhead, or wrong load regime.",
    "reviewer_risk": "Existing metrics lack actual sequentiality proof and per-request transfer waterfall.",
    "status": "supported_by_existing_artifact",
    "supporting_data": "outputs/contention_16s_ts10 vs outputs/contention_16s_elastic metrics.summary.json and gpu_util.csv."
  },
  {
    "claim": "PD-colo prefill/decode interference is not yet directly proven by step-level data in this package.",
    "needed_next": "Run Batch 2 controlled same-worker/different-worker injection with step timestamps.",
    "reviewer_risk": "Cannot claim interference as causal without Batch 2.",
    "status": "not_yet_supported",
    "supporting_data": "No decode-step and prefill-overlap timestamp artifact found in summarized runs."
  },
  {
    "claim": "Session hot-spot residual imbalance is suggested but not fully attributed.",
    "needed_next": "Collect per-worker queue delay, session-to-worker map, and per-session token mass per worker.",
    "reviewer_risk": "GPU util imbalance alone is not enough to prove session hot-spot.",
    "status": "partially_supported",
    "supporting_data": "gpu_util.csv shows per-GPU mean-util imbalance in existing runs."
  },
  {
    "claim": "SRR is not measured by existing fixed-request runs.",
    "needed_next": "Implement Batch 4 Poisson session-arrival SRR sweep.",
    "reviewer_risk": "Latency-at-one-load cannot support sustainable throughput claim.",
    "status": "not_yet_supported",
    "supporting_data": "No arrival-rate sweep artifacts found."
  }
]