diff --git a/analysis/mb5_pd_ablation/fig1.json b/analysis/mb5_pd_ablation/fig1.json new file mode 100644 index 0000000..9907313 --- /dev/null +++ b/analysis/mb5_pd_ablation/fig1.json @@ -0,0 +1 @@ +[{"name": "fig1_p2048_colo_8C-proxy_rep1", "arm": "colo", "n": 400, "req": 400, "e2e_p50": 2.555279068008531, "e2e_p90": 3.4275179531075994, "e2e_p99": 5.231042563370427, "e2e_mean": 2.533418247078953, "ttft_p90": 1.059612769272644, "tpot_p99": 0.016176813139529973, "tps": 488.9617809203525, "wall": 209.42332099506166, "pu": 35.58080808080808, "du": null, "apc": 0.21875}, {"name": "fig1_p2048_pd2_2P+6D_rep1", "arm": "2P+6D", "n": 400, "req": 400, "e2e_p50": 1.9831702220253646, "e2e_p90": 2.408317962428555, "e2e_p99": 3.386159433723659, "e2e_mean": 2.080101182157232, "ttft_p90": 1.0395420689717867, "tpot_p99": 0.005522104923522062, "tps": 530.468328273858, "wall": 193.0369723169133, "pu": 46.917582417582416, "du": 48.58058608058608, "apc": 0.21875}, {"name": "fig1_p2048_pd6_6P+2D_rep1", "arm": "6P+2D", "n": 400, "req": 400, "e2e_p50": 2.3523911059601232, "e2e_p90": 2.7588249894673935, "e2e_p99": 3.603395572100996, "e2e_mean": 2.4113844815955963, "ttft_p90": 0.7664745874935761, "tpot_p99": 0.009031482047424195, "tps": 488.72074894811755, "wall": 209.52660639106762, "pu": 14.218855218855218, "du": 90.16161616161617, "apc": 0.21875}, {"name": "fig1_p2048_pd_4P+4D_rep1", "arm": "4P+4D", "n": 400, "req": 400, "e2e_p50": 2.0429230239242315, "e2e_p90": 2.2362921022577216, "e2e_p99": 2.9135718233766945, "e2e_mean": 2.095764471256989, "ttft_p90": 0.7477957331226207, "tpot_p99": 0.006522443569635094, "tps": 527.6226407393885, "wall": 194.07810069806874, "pu": 23.669444444444444, "du": 65.01666666666667, "apc": 0.21875}, {"name": "fig1_p4096_colo_8C-proxy_rep1", "arm": "colo", "n": 400, "req": 400, "e2e_p50": 2.2734450550633483, "e2e_p90": 3.0487391501781533, "e2e_p99": 4.6287568241392725, "e2e_mean": 2.2661249774988392, "ttft_p90": 0.713115519611165, "tpot_p99": 
0.014206131751343207, "tps": 520.0122707724117, "wall": 196.9184301130008, "pu": 34.659946236559136, "du": null, "apc": 0.4375}, {"name": "fig1_p4096_pd2_2P+6D_rep1", "arm": "2P+6D", "n": 400, "req": 400, "e2e_p50": 1.8291561939986423, "e2e_p90": 2.2601341274916207, "e2e_p99": 3.2612337802827804, "e2e_mean": 1.9412393476464787, "ttft_p90": 0.8800801524659628, "tpot_p99": 0.005551423189517877, "tps": 552.1771045541858, "wall": 185.44774702796713, "pu": 38.293103448275865, "du": 49.05747126436781, "apc": 0.4375}, {"name": "fig1_p4096_pd6_6P+2D_rep1", "arm": "6P+2D", "n": 400, "req": 400, "e2e_p50": 2.2462158624548465, "e2e_p90": 2.64611944751814, "e2e_p99": 3.4558432800625423, "e2e_mean": 2.2963840230583448, "ttft_p90": 0.711689621617552, "tpot_p99": 0.008991341477657172, "tps": 502.68863490365385, "wall": 203.70462526893243, "pu": 10.604166666666666, "du": 88.75520833333333, "apc": 0.4375}, {"name": "fig1_p4096_pd_4P+4D_rep1", "arm": "4P+4D", "n": 400, "req": 400, "e2e_p50": 1.8967114535043947, "e2e_p90": 2.18965909028193, "e2e_p99": 2.9952263131842467, "e2e_mean": 1.9644193770724814, "ttft_p90": 0.7082089037983679, "tpot_p99": 0.006562968838594706, "tps": 548.1341659016695, "wall": 186.81557613098994, "pu": 19.133522727272727, "du": 67.32102272727273, "apc": 0.4375}, {"name": "fig1_p512_colo_8C-proxy_rep1", "arm": "colo", "n": 400, "req": 400, "e2e_p50": 2.6645242294762284, "e2e_p90": 3.61182137549622, "e2e_p99": 5.3448568455432515, "e2e_mean": 2.6195515102380886, "ttft_p90": 1.112424884561915, "tpot_p99": 0.01658212741880276, "tps": 475.13983834945145, "wall": 215.5155003539985, "pu": 36.375, "du": null, "apc": 0.0546875}, {"name": "fig1_p512_pd2_2P+6D_rep1", "arm": "2P+6D", "n": 400, "req": 400, "e2e_p50": 2.103162212530151, "e2e_p90": 2.4044417485478347, "e2e_p99": 3.3844505867047685, "e2e_mean": 2.1739998702009324, "ttft_p90": 1.0342383790179164, "tpot_p99": 0.00550937183462905, "tps": 517.9536953932844, "wall": 197.7010704060085, "pu": 55.854838709677416, 
"du": 46.65232974910394, "apc": 0.0546875}, {"name": "fig1_p512_pd6_6P+2D_rep1", "arm": "6P+2D", "n": 400, "req": 400, "e2e_p50": 2.3785464115208015, "e2e_p90": 2.7350583746214396, "e2e_p99": 3.4022513560648044, "e2e_mean": 2.445902353489655, "ttft_p90": 0.7912747941561975, "tpot_p99": 0.009167485321045615, "tps": 482.02537525954966, "wall": 212.436948874034, "pu": 17.535353535353536, "du": 91.0959595959596, "apc": 0.0546875}, {"name": "fig1_p512_pd_4P+4D_rep1", "arm": "4P+4D", "n": 400, "req": 400, "e2e_p50": 2.153953853994608, "e2e_p90": 2.2966971063637174, "e2e_p99": 3.044859012498054, "e2e_mean": 2.1845501415853503, "ttft_p90": 0.7954113337909803, "tpot_p99": 0.006337697780334992, "tps": 512.0369567409949, "wall": 199.9855648149969, "pu": 26.21276595744681, "du": 62.944148936170215, "apc": 0.0546875}, {"name": "fig1_p6144_colo_8C-proxy_rep1", "arm": "colo", "n": 400, "req": 400, "e2e_p50": 2.0121258909930475, "e2e_p90": 2.7345886924420504, "e2e_p99": 3.9082167004665824, "e2e_mean": 2.0664054725714958, "ttft_p90": 0.5843230376602151, "tpot_p99": 0.01391471299074371, "tps": 547.3893798822313, "wall": 187.0697601441061, "pu": 34.44744318181818, "du": null, "apc": 0.65625}, {"name": "fig1_p6144_pd2_2P+6D_rep1", "arm": "2P+6D", "n": 400, "req": 400, "e2e_p50": 1.6655802099849097, "e2e_p90": 2.1824162190663636, "e2e_p99": 3.4232188416458658, "e2e_mean": 1.811165077216283, "ttft_p90": 0.7816491658217275, "tpot_p99": 0.005717973897763179, "tps": 574.0385687244718, "wall": 178.38522632292006, "pu": 21.53012048192771, "du": 51.61244979919679, "apc": 0.65625}, {"name": "fig1_p6144_pd6_6P+2D_rep1", "arm": "6P+2D", "n": 400, "req": 400, "e2e_p50": 2.104133589542471, "e2e_p90": 2.6115266072680243, "e2e_p99": 3.388375885724087, "e2e_mean": 2.1801070072923903, "ttft_p90": 0.7013537635677495, "tpot_p99": 0.00888146581120022, "tps": 521.6799713459584, "wall": 196.2889235249022, "pu": 6.709677419354839, "du": 88.70430107526882, "apc": 0.65625}, {"name": 
"fig1_p6144_pd_4P+4D_rep1", "arm": "4P+4D", "n": 399, "req": 400, "e2e_p50": 1.748427166021429, "e2e_p90": 2.1873498664470388, "e2e_p99": 3.1015963148581767, "e2e_mean": 1.8389387578233134, "ttft_p90": 0.6869595416123048, "tpot_p99": 0.006769578668425845, "tps": 569.2010105657868, "wall": 179.45154366199858, "pu": 13.089285714285714, "du": 68.10119047619048, "apc": 0.65625}, {"name": "fig1_p7168_colo_8C-proxy_rep1", "arm": "colo", "n": 400, "req": 400, "e2e_p50": 1.9153751800186, "e2e_p90": 2.6549224384711154, "e2e_p99": 3.861135128394235, "e2e_mean": 1.958507129738573, "ttft_p90": 0.5802406982053072, "tpot_p99": 0.013510460370503293, "tps": 564.6374955198476, "wall": 181.35529576498084, "pu": 33.595588235294116, "du": null, "apc": 0.765625}, {"name": "fig1_p7168_pd2_2P+6D_rep1", "arm": "2P+6D", "n": 400, "req": 400, "e2e_p50": 1.5798494375194423, "e2e_p90": 2.1327995942323468, "e2e_p99": 3.3373043064947687, "e2e_mean": 1.7140900615448482, "ttft_p90": 0.727025477041025, "tpot_p99": 0.005695829117182167, "tps": 593.7272444367894, "wall": 172.46976782602724, "pu": 18.30246913580247, "du": 51.51234567901235, "apc": 0.765625}, {"name": "fig1_p7168_pd6_6P+2D_rep1", "arm": "6P+2D", "n": 400, "req": 400, "e2e_p50": 2.1042530980193987, "e2e_p90": 2.584451850724873, "e2e_p99": 3.5201327085494967, "e2e_mean": 2.164814479558263, "ttft_p90": 0.6972717452561484, "tpot_p99": 0.009263812077688232, "tps": 527.1747330056295, "wall": 194.24299684504513, "pu": 5.574275362318841, "du": 86.96195652173913, "apc": 0.765625}, {"name": "fig1_p7168_pd_4P+4D_rep1", "arm": "4P+4D", "n": 400, "req": 400, "e2e_p50": 1.621718394511845, "e2e_p90": 2.1209186871652492, "e2e_p99": 2.9950801801832854, "e2e_mean": 1.7384027546702419, "ttft_p90": 0.681954695621971, "tpot_p99": 0.006750334063133991, "tps": 587.6633213260067, "wall": 174.24943208799232, "pu": 9.521341463414634, "du": 70.10060975609755, "apc": 0.765625}, {"name": "fig1_p7680_colo_8C-proxy_rep1", "arm": "colo", "n": 400, "req": 400, 
"e2e_p50": 1.805449907493312, "e2e_p90": 2.2804414638434545, "e2e_p99": 3.2436008435313126, "e2e_mean": 1.8311505301928264, "ttft_p90": 0.5787636383553035, "tpot_p99": 0.01186512264145045, "tps": 586.5851277673049, "wall": 174.5697174249217, "pu": 33.792682926829265, "du": null, "apc": 0.8203125}, {"name": "fig1_p7680_pd2_2P+6D_rep1", "arm": "2P+6D", "n": 400, "req": 400, "e2e_p50": 1.5353662585257553, "e2e_p90": 2.0703843546216376, "e2e_p99": 3.239132529124615, "e2e_mean": 1.6632775271314313, "ttft_p90": 0.684448261326179, "tpot_p99": 0.005772996509146383, "tps": 601.1715360533158, "wall": 170.3340791419614, "pu": 13.35625, "du": 52.83125, "apc": 0.8203125}, {"name": "fig1_p7680_pd6_6P+2D_rep1", "arm": "6P+2D", "n": 400, "req": 400, "e2e_p50": 1.935218213009648, "e2e_p90": 2.540124618355185, "e2e_p99": 3.5381310180295236, "e2e_mean": 2.0565583147658617, "ttft_p90": 0.6883091802941635, "tpot_p99": 0.009528811932669258, "tps": 539.8052979764931, "wall": 189.698027017992, "pu": 5.219101123595506, "du": 90.43820224719101, "apc": 0.8203125}, {"name": "fig1_p7680_pd_4P+4D_rep1", "arm": "4P+4D", "n": 400, "req": 400, "e2e_p50": 1.6052846949896775, "e2e_p90": 2.16270094960928, "e2e_p99": 3.0139038068498483, "e2e_mean": 1.7113545338093537, "ttft_p90": 0.6793354631052353, "tpot_p99": 0.006723990285321704, "tps": 594.143824556681, "wall": 172.34884175797924, "pu": 8.765432098765432, "du": 69.9320987654321, "apc": 0.8203125}] diff --git a/analysis/mb5_pd_ablation/fig2.json b/analysis/mb5_pd_ablation/fig2.json new file mode 100644 index 0000000..6afed7a --- /dev/null +++ b/analysis/mb5_pd_ablation/fig2.json @@ -0,0 +1 @@ +[{"name": "fig2_in16384_out128_colo_8C-proxy_rep1", "arm": "colo", "n": 400, "req": 400, "e2e_p50": 1.6966288615367375, "e2e_p90": 3.142477283347398, "e2e_p99": 4.572902428222587, "e2e_mean": 1.8778391962422756, "ttft_p90": 1.528641331603285, "tpot_p99": 0.02700975849941244, "tps": 293.2414474758892, "wall": 174.600147560006, "pu": 30.718373493975903, "du": 
null, "apc": 0.73828125}, {"name": "fig2_in16384_out128_pd2_2P+6D_rep1", "arm": "2P+6D", "n": 400, "req": 400, "e2e_p50": 1.5746862735250033, "e2e_p90": 3.6393908081925486, "e2e_p99": 6.788023261578052, "e2e_mean": 2.1054475268305395, "ttft_p90": 2.8525443844730045, "tpot_p99": 0.007377313970786145, "tps": 272.743216323279, "wall": 187.72235911199823, "pu": 54.79545454545455, "du": 28.009469696969695, "apc": 0.73828125}, {"name": "fig2_in16384_out128_pd4_4P+4D_rep1", "arm": "4P+4D", "n": 400, "req": 400, "e2e_p50": 1.2106705509941094, "e2e_p90": 2.6971542384708305, "e2e_p99": 4.516567796494346, "e2e_mean": 1.6196880877471995, "ttft_p90": 1.8512291587772782, "tpot_p99": 0.007638815456312003, "tps": 307.7022111731225, "wall": 166.3946443699533, "pu": 28.876582278481013, "du": 47.36708860759494, "apc": 0.73828125}, {"name": "fig2_in16384_out128_pd6_6P+2D_rep1", "arm": "6P+2D", "n": 400, "req": 400, "e2e_p50": 1.3666948495083489, "e2e_p90": 2.656380763812923, "e2e_p99": 4.434802388340466, "e2e_mean": 1.6502306728763505, "ttft_p90": 1.7600484249996953, "tpot_p99": 0.009977159781425488, "tps": 307.56190002160906, "wall": 166.47055437101517, "pu": 21.023206751054854, "du": 70.51898734177215, "apc": 0.73828125}, {"name": "fig2_in2048_out2048_colo_8C-proxy_rep1", "arm": "colo", "n": 400, "req": 400, "e2e_p50": 11.900513574946672, "e2e_p90": 14.623661132121924, "e2e_p99": 17.82160759311984, "e2e_mean": 12.263538628305833, "ttft_p90": 0.13757785173365847, "tpot_p99": 0.00867108589104906, "tps": 1109.2196116287032, "wall": 738.5372485410189, "pu": 54.30869565217391, "du": null, "apc": 0.65625}, {"name": "fig2_in2048_out2048_pd2_2P+6D_rep1", "arm": "2P+6D", "n": 400, "req": 400, "e2e_p50": 11.239029973454308, "e2e_p90": 12.24954682419775, "e2e_p99": 12.908233385497004, "e2e_mean": 11.36166481389053, "ttft_p90": 0.1597270941361785, "tpot_p99": 0.006243306631126823, "tps": 1159.3604844966112, "wall": 706.5964477439411, "pu": 1.9437689969604863, "du": 86.7517730496454, "apc": 
0.65625}, {"name": "fig2_in2048_out2048_pd4_4P+4D_rep1", "arm": "4P+4D", "n": 400, "req": 400, "e2e_p50": 12.676327208988369, "e2e_p90": 13.124083981337025, "e2e_p99": 13.789963249830762, "e2e_mean": 12.521095666602777, "ttft_p90": 0.1668232314521447, "tpot_p99": 0.006606968528777976, "tps": 1070.1894910008175, "wall": 765.4719158509979, "pu": 0.5945378151260504, "du": 92.65546218487395, "apc": 0.65625}, {"name": "fig2_in2048_out2048_pd6_6P+2D_rep1", "arm": "6P+2D", "n": 400, "req": 400, "e2e_p50": 15.628125407500193, "e2e_p90": 16.762494630913714, "e2e_p99": 17.865684803246978, "e2e_mean": 15.437463862727746, "ttft_p90": 0.1816938084899448, "tpot_p99": 0.008672833048181654, "tps": 897.4033352505149, "wall": 912.8559788239654, "pu": 0.2651869158878505, "du": 98.21028037383178, "apc": 0.65625}, {"name": "fig2_in32768_out64_colo_8C-proxy_rep1", "arm": "colo", "n": 400, "req": 400, "e2e_p50": 2.8777761650271714, "e2e_p90": 7.02909248394426, "e2e_p99": 12.042338756883982, "e2e_mean": 3.6056005006073972, "ttft_p90": 4.589756254199893, "tpot_p99": 0.15461345151164715, "tps": 97.4559162735194, "wall": 262.6828722039936, "pu": 36.19410569105691, "du": null, "apc": 0.73828125}, {"name": "fig2_in32768_out64_pd2_2P+6D_rep1", "arm": "2P+6D", "n": 378, "req": 400, "e2e_p50": 5.744399158516899, "e2e_p90": 17.501065154711252, "e2e_p99": 431.9109102533118, "e2e_mean": 24.76107206362763, "ttft_p90": 17.079777074372398, "tpot_p99": 0.008512455084701146, "tps": 17.72103702655267, "wall": 1365.1571273030713, "pu": 22.84921875, "du": 2.06796875, "apc": 0.8334464289939819}, {"name": "fig2_in32768_out64_pd4_4P+4D_rep1", "arm": "4P+4D", "n": 400, "req": 400, "e2e_p50": 2.331694360531401, "e2e_p90": 8.168041506421288, "e2e_p99": 16.819581468357928, "e2e_mean": 4.067478344673291, "ttft_p90": 7.7613852798473095, "tpot_p99": 0.008991237692276223, "tps": 89.86030789358054, "wall": 284.8866268109996, "pu": 53.20335820895522, "du": 15.065298507462687, "apc": 0.73828125}, {"name": 
"fig2_in32768_out64_pd6_6P+2D_rep1", "arm": "6P+2D", "n": 400, "req": 400, "e2e_p50": 1.881187686463818, "e2e_p90": 6.823026831133758, "e2e_p99": 12.242816790416828, "e2e_mean": 3.2513622655556538, "ttft_p90": 6.349652938055806, "tpot_p99": 0.011577233054050565, "tps": 105.74545516262978, "wall": 242.09078263107222, "pu": 42.801169590643276, "du": 31.153508771929825, "apc": 0.73828125}, {"name": "fig2_in4096_out1024_colo_8C-proxy_rep1", "arm": "colo", "n": 400, "req": 400, "e2e_p50": 6.376699871034361, "e2e_p90": 8.016901113302447, "e2e_p99": 9.421493258888365, "e2e_mean": 6.472622742803069, "ttft_p90": 0.26107478952035307, "tpot_p99": 0.009009339244909244, "tps": 964.4334957573764, "wall": 424.7052822220139, "pu": 50.12248743718593, "du": null, "apc": 0.65625}, {"name": "fig2_in4096_out1024_pd2_2P+6D_rep1", "arm": "2P+6D", "n": 400, "req": 400, "e2e_p50": 5.711871185048949, "e2e_p90": 6.152766603662167, "e2e_p99": 6.618846287685439, "e2e_mean": 5.7896922694568635, "ttft_p90": 0.2993865112075582, "tpot_p99": 0.006226416155723225, "tps": 1026.860822805463, "wall": 398.88560445897747, "pu": 3.8877005347593583, "du": 83.79411764705883, "apc": 0.65625}, {"name": "fig2_in4096_out1024_pd4_4P+4D_rep1", "arm": "4P+4D", "n": 400, "req": 400, "e2e_p50": 6.441164412011858, "e2e_p90": 6.755943879298865, "e2e_p99": 7.246829778881511, "e2e_mean": 6.4186325767840025, "ttft_p90": 0.30361198947066437, "tpot_p99": 0.0066874305859860395, "tps": 948.5732059117771, "wall": 431.8064198390348, "pu": 2.6683168316831685, "du": 88.01608910891089, "apc": 0.65625}, {"name": "fig2_in4096_out1024_pd6_6P+2D_rep1", "arm": "6P+2D", "n": 400, "req": 400, "e2e_p50": 8.175307728059124, "e2e_p90": 8.772436089895201, "e2e_p99": 9.845743471009191, "e2e_mean": 8.103695073690615, "ttft_p90": 0.3135268738726154, "tpot_p99": 0.008783244535960586, "tps": 795.3463509805472, "wall": 514.9957619030029, "pu": 1.2988980716253444, "du": 95.7107438016529, "apc": 0.65625}, {"name": 
"fig2_in8192_out512_colo_8C-proxy_rep1", "arm": "colo", "n": 400, "req": 400, "e2e_p50": 3.569815175491385, "e2e_p90": 4.748414856137243, "e2e_p99": 6.3728869484120505, "e2e_mean": 3.6905484462657476, "ttft_p90": 0.5787142073037103, "tpot_p99": 0.011623186658178922, "tps": 749.35951206451, "wall": 273.3000605220441, "pu": 43.21484375, "du": null, "apc": 0.7109375}, {"name": "fig2_in8192_out512_pd2_2P+6D_rep1", "arm": "2P+6D", "n": 400, "req": 400, "e2e_p50": 3.0584998495178297, "e2e_p90": 3.546729282848538, "e2e_p99": 4.885626904441742, "e2e_mean": 3.183082153094583, "ttft_p90": 0.6684098902973354, "tpot_p99": 0.006093405278323496, "tps": 801.0277344160907, "wall": 255.67154693999328, "pu": 14.795833333333333, "du": 70.95138888888889, "apc": 0.7109375}, {"name": "fig2_in8192_out512_pd4_4P+4D_rep1", "arm": "4P+4D", "n": 400, "req": 400, "e2e_p50": 3.3473425395786762, "e2e_p90": 3.8297921021352526, "e2e_p99": 4.728309926969231, "e2e_mean": 3.4304884171887533, "ttft_p90": 0.647590011463035, "tpot_p99": 0.0067240075080280985, "tps": 768.7152035389245, "wall": 266.41856315208133, "pu": 7.96, "du": 83.674, "apc": 0.7109375}, {"name": "fig2_in8192_out512_pd6_6P+2D_rep1", "arm": "6P+2D", "n": 400, "req": 400, "e2e_p50": 4.395662502502091, "e2e_p90": 4.981798351998441, "e2e_p99": 6.572449592349585, "e2e_mean": 4.434228266531718, "ttft_p90": 0.6629501176299528, "tpot_p99": 0.009418493171829412, "tps": 645.2526253784575, "wall": 317.3950665909797, "pu": 5.468680089485459, "du": 94.43959731543625, "apc": 0.7109375}] diff --git a/analysis/mb5_pd_ablation/fig3.json b/analysis/mb5_pd_ablation/fig3.json new file mode 100644 index 0000000..b85c9ed --- /dev/null +++ b/analysis/mb5_pd_ablation/fig3.json @@ -0,0 +1 @@ +[{"name": "fig3_N16_colo_8C-proxy_rep1", "arm": "colo", "n": 720, "req": 720, "e2e_p50": 2.273988057495444, "e2e_p90": 3.22202166619245, "e2e_p99": 4.154007889915082, "e2e_mean": 2.349281024678405, "ttft_p90": 0.5880337386857718, "tpot_p99": 0.013491011632263985, "tps": 
1007.0977376198009, "wall": 183.02096521001658, "pu": 53.47674418604651, "du": null, "apc": 0.7109375}, {"name": "fig3_N16_pd2_2P+6D_rep1", "arm": "2P+6D", "n": 720, "req": 720, "e2e_p50": 1.8571838534990093, "e2e_p90": 2.6877894366974946, "e2e_p99": 5.491437585417586, "e2e_mean": 2.106012088546211, "ttft_p90": 1.098052769700007, "tpot_p99": 0.00736965422303468, "tps": 1083.2982957028535, "wall": 170.14704142999835, "pu": 43.6625, "du": 80.82708333333333, "apc": 0.7109375}, {"name": "fig3_N16_pd4_4P+4D_rep1", "arm": "4P+4D", "n": 720, "req": 720, "e2e_p50": 1.9961085925024236, "e2e_p90": 2.6387570307022545, "e2e_p99": 4.1306983676511875, "e2e_mean": 2.1104672683014645, "ttft_p90": 0.751794431997405, "tpot_p99": 0.008509515762943705, "tps": 1093.4268194900712, "wall": 168.57095208800456, "pu": 19.946202531645568, "du": 98.21518987341773, "apc": 0.7109375}, {"name": "fig3_N16_pd6_6P+2D_rep1", "arm": "6P+2D", "n": 720, "req": 720, "e2e_p50": 3.0437641629832797, "e2e_p90": 3.5130674026062483, "e2e_p99": 5.287108179212371, "e2e_mean": 3.041667996072081, "ttft_p90": 0.7129690356960056, "tpot_p99": 0.013359745218868221, "tps": 871.970575501767, "wall": 211.38327964098426, "pu": 10.943333333333333, "du": 96.86, "apc": 0.7109375}, {"name": "fig3_N32_colo_8C-proxy_rep1", "arm": "colo", "n": 1320, "req": 1320, "e2e_p50": 3.270167972994386, "e2e_p90": 4.661326845278381, "e2e_p99": 6.208903694198525, "e2e_mean": 3.2551948325128417, "ttft_p90": 0.9038233671861248, "tpot_p99": 0.01838023195033048, "tps": 1580.8633971808533, "wall": 213.7566095860093, "pu": 66.3625, "du": null, "apc": 0.7109375}, {"name": "fig3_N32_pd2_2P+6D_rep1", "arm": "2P+6D", "n": 1320, "req": 1320, "e2e_p50": 2.5704439695036854, "e2e_p90": 6.883691897706018, "e2e_p99": 17.488955901044665, "e2e_mean": 3.761722041763344, "ttft_p90": 5.035864923497138, "tpot_p99": 0.010349134326673354, "tps": 1479.6196585494329, "wall": 228.38301589699404, "pu": 56.61682242990654, "du": 82.69626168224299, "apc": 0.7109375}, 
{"name": "fig3_N32_pd4_4P+4D_rep1", "arm": "4P+4D", "n": 1320, "req": 1320, "e2e_p50": 3.1077146044990513, "e2e_p90": 3.770394044389833, "e2e_p99": 6.062736103993954, "e2e_mean": 3.2164792430455265, "ttft_p90": 1.0083121383970153, "tpot_p99": 0.011962187868884226, "tps": 1608.9998823250762, "wall": 210.01866048100055, "pu": 29.68877551020408, "du": 94.66836734693878, "apc": 0.7109375}, {"name": "fig3_N32_pd6_6P+2D_rep1", "arm": "6P+2D", "n": 1320, "req": 1320, "e2e_p50": 4.650343854504172, "e2e_p90": 5.231803922989639, "e2e_p99": 7.026731992097026, "e2e_mean": 4.642735430796385, "ttft_p90": 0.8066709822014674, "tpot_p99": 0.018365715299701022, "tps": 1244.6646057403061, "wall": 271.4948255470081, "pu": 17.44750656167979, "du": 97.92125984251969, "apc": 0.7109375}, {"name": "fig3_N64_colo_8C-proxy_rep1", "arm": "colo", "n": 2640, "req": 2640, "e2e_p50": 4.616785284990328, "e2e_p90": 6.662268486898392, "e2e_p99": 9.11107949850848, "e2e_mean": 4.8815010888681805, "ttft_p90": 1.4007563413004391, "tpot_p99": 0.028896959955475372, "tps": 2431.5635136762567, "wall": 277.9446213100164, "pu": 80.68076923076923, "du": null, "apc": 0.7109375}, {"name": "fig3_N64_pd2_2P+6D_rep1", "arm": "2P+6D", "n": 2639, "req": 2640, "e2e_p50": 11.69906226900639, "e2e_p90": 31.074856758594986, "e2e_p99": 33.94995162280335, "e2e_mean": 14.142758539245058, "ttft_p90": 29.560715823207286, "tpot_p99": 0.013875843108832534, "tps": 698.1370406736987, "wall": 967.6953959469975, "pu": 43.86363636363637, "du": 45.14781966001478, "apc": 0.4577210235884805}, {"name": "fig3_N64_pd4_4P+4D_rep1", "arm": "4P+4D", "n": 2601, "req": 2640, "e2e_p50": 4.077710573998047, "e2e_p90": 16.441288907983107, "e2e_p99": 385.4163983319886, "e2e_mean": 13.444935590261034, "ttft_p90": 14.423547562997555, "tpot_p99": 0.0182731644510675, "tps": 864.9622710798936, "wall": 769.8092995070037, "pu": 19.74375, "du": 51.03541666666667, "apc": 0.014043434389389917}, {"name": "fig3_N64_pd6_6P+2D_rep1", "arm": "6P+2D", "n": 2640, 
"req": 2640, "e2e_p50": 7.523401059006574, "e2e_p90": 7.9631150633882495, "e2e_p99": 11.943453508106181, "e2e_mean": 7.532413133775613, "ttft_p90": 0.9084304597956361, "tpot_p99": 0.028826642419567665, "tps": 1712.6176644681352, "wall": 394.62398060099804, "pu": 21.95225225225225, "du": 98.26216216216216, "apc": 0.7109375}, {"name": "fig3_N8_colo_8C-proxy_rep1", "arm": "colo", "n": 360, "req": 360, "e2e_p50": 1.9761100795149105, "e2e_p90": 2.687890137603972, "e2e_p99": 3.689032165001845, "e2e_mean": 2.0329397837324197, "ttft_p90": 0.5798680250212783, "tpot_p99": 0.012779506407645046, "tps": 564.0943793942257, "wall": 163.3769159319927, "pu": 31.852272727272727, "du": null, "apc": 0.7109375}, {"name": "fig3_N8_pd2_2P+6D_rep1", "arm": "2P+6D", "n": 360, "req": 360, "e2e_p50": 1.6348335455040797, "e2e_p90": 2.1373069930952626, "e2e_p99": 3.27345111219359, "e2e_mean": 1.7528110698889374, "ttft_p90": 0.7096388279009263, "tpot_p99": 0.005832891406421778, "tps": 607.517244709636, "wall": 151.699397511009, "pu": 24.97222222222222, "du": 53.789351851851855, "apc": 0.7109375}, {"name": "fig3_N8_pd4_4P+4D_rep1", "arm": "4P+4D", "n": 360, "req": 360, "e2e_p50": 1.6989141649974044, "e2e_p90": 2.1767679123018753, "e2e_p99": 3.204218823980048, "e2e_mean": 1.7972881239612535, "ttft_p90": 0.671729751705425, "tpot_p99": 0.006599093441914785, "tps": 601.1982226696963, "wall": 153.2938663570094, "pu": 13.072916666666666, "du": 68.96527777777777, "apc": 0.7109375}, {"name": "fig3_N8_pd6_6P+2D_rep1", "arm": "6P+2D", "n": 360, "req": 360, "e2e_p50": 2.1166427994903643, "e2e_p90": 2.5305729087849618, "e2e_p99": 3.9926339721458506, "e2e_mean": 2.1701972734549297, "ttft_p90": 0.698665179402451, "tpot_p99": 0.009245334605164794, "tps": 539.9759996392945, "wall": 170.67425230299705, "pu": 7.295833333333333, "du": 92.01875, "apc": 0.7109375}] diff --git a/figs/mb5_pd_ablation/fig1_reuse_axis.png b/figs/mb5_pd_ablation/fig1_reuse_axis.png new file mode 100644 index 0000000..6588a29 Binary files 
/dev/null and b/figs/mb5_pd_ablation/fig1_reuse_axis.png differ diff --git a/figs/mb5_pd_ablation/fig2_shape_axis.png b/figs/mb5_pd_ablation/fig2_shape_axis.png new file mode 100644 index 0000000..42ad5d4 Binary files /dev/null and b/figs/mb5_pd_ablation/fig2_shape_axis.png differ diff --git a/figs/mb5_pd_ablation/fig3_concurrency_axis.png b/figs/mb5_pd_ablation/fig3_concurrency_axis.png new file mode 100644 index 0000000..97dcb69 Binary files /dev/null and b/figs/mb5_pd_ablation/fig3_concurrency_axis.png differ diff --git a/microbench/fresh_setup/fig_agg.py b/microbench/fresh_setup/fig_agg.py new file mode 100644 index 0000000..f8497f6 --- /dev/null +++ b/microbench/fresh_setup/fig_agg.py @@ -0,0 +1,140 @@ +"""Aggregate a set of MB5 run dirs into one comparison table. + +Pulls the three core metrics the analysis cares about, per run: + - E2E latency (from replay_metrics.summary.json: latency_stats_s) + - TPS (output tokens / wall_clock_s) + - GPU util by workers (gpu_util.csv over run_window, split prefill/decode by role) +plus honest reuse (producer-side APC from instance_apc.txt) and TTFT/TPOT for logs. + +Arm + GPU role split + producer APC ports are inferred from the dir name: + *_colo_* -> 8 kv_both ; apc ports 8000-8007 (all keep prefix) + *_pd6_* -> 6P+2D P0-5/D6-7 ; apc 8000-8005 + *_pd_* -> 4P+4D P0-3/D4-7 ; apc 8000-8003 (note: "pd" not "pd4") + *_pd2_* -> 2P+6D P0-1/D2-7 ; apc 8000-8001 + +Usage: fig_agg.py [--json] <run_dir> [<run_dir> ...] +""" +from __future__ import annotations + +import csv +import json +import re +import statistics +import sys +from pathlib import Path + + +def arm_of(name: str): + # New driver naming (run_conc.sh / run_reuse_fixed.sh): "...<tag>_<arm>_rep<N>". 
+ if "8C-proxy" in name: + return "colo", list(range(8)), [], list(range(8000, 8008)) + if "6P+2D" in name: + return "6P+2D", [0, 1, 2, 3, 4, 5], [6, 7], list(range(8000, 8006)) + if "2P+6D" in name: + return "2P+6D", [0, 1], [2, 3, 4, 5, 6, 7], list(range(8000, 8002)) + if "4P+4D" in name: + return "4P+4D", [0, 1, 2, 3], [4, 5, 6, 7], list(range(8000, 8004)) + # Legacy naming (original May-30 corrected runs). + if "_colo_" in name or name.endswith("_colo"): + return "colo", list(range(8)), [], list(range(8000, 8008)) + if "_pd6_" in name: + return "6P+2D", [0, 1, 2, 3, 4, 5], [6, 7], list(range(8000, 8006)) + if "_pd2_" in name: + return "2P+6D", [0, 1], [2, 3, 4, 5, 6, 7], list(range(8000, 8002)) + if "_pd4_" in name or "_pd_" in name: + return "4P+4D", [0, 1, 2, 3], [4, 5, 6, 7], list(range(8000, 8004)) + return "?", list(range(8)), [], list(range(8000, 8008)) + + +def util_split(run: Path, pgpus, dgpus): + win = {} + wp = run / "run_window.json" + if wp.exists(): + win = json.load(open(wp)) + t0, t1 = win.get("t_start_unix"), win.get("t_end_unix") + csvp = run / "gpu_util.csv" + if not csvp.exists(): + return None, None + by = {} + for row in csv.DictReader(open(csvp)): + try: + ts = float(row["timestamp"]); g = int(row["gpu"]); u = float(row["util_pct"]) + except (ValueError, KeyError): + continue + if t0 and not (t0 <= ts <= t1): + continue + by.setdefault(g, []).append(u) + pm = [v for g in pgpus for v in by.get(g, [])] + dm = [v for g in dgpus for v in by.get(g, [])] + return (statistics.fmean(pm) if pm else None, + statistics.fmean(dm) if dm else None) + + +def apc(run: Path, ports): + f = run / "instance_apc.txt" + if not f.exists(): + return None + q = h = 0 + for line in open(f): + m = dict(re.findall(r"(\w+)=(\S+)", line)) + try: + p = int(m.get("port", -1)) + except ValueError: + continue + if p in ports: + q += float(m.get("queries", 0)); h += float(m.get("hits", 0)) + return (h / q) if q else None + + +def main(): + args = sys.argv[1:] + as_json = 
False + if "--json" in args: + as_json = True + args = [a for a in args if a != "--json"] + rows = [] + for d in args: + run = Path(d) + sp = run / "replay_metrics.summary.json" + if not sp.exists(): + continue + s = json.load(open(sp)) + arm, pg, dg, ports = arm_of(run.name) + lat = s.get("latency_stats_s", {}) + ttft = s.get("ttft_stats_s", {}) + tpot = s.get("tpot_stats_s", {}) + wall = s.get("wall_clock_s") or 1.0 + out = s.get("actual_output_tokens_stats", {}) + n = s.get("success_count", 0); req = s.get("request_count", 0) + tot_out = out.get("count", 0) * out.get("mean", 0) + tps = tot_out / wall + pu, du = util_split(run, pg, dg) + a = apc(run, ports) + rows.append({ + "name": run.name, "arm": arm, "n": n, "req": req, + "e2e_p50": lat.get("p50"), "e2e_p90": lat.get("p90"), "e2e_p99": lat.get("p99"), + "e2e_mean": lat.get("mean"), + "ttft_p90": ttft.get("p90"), "tpot_p99": tpot.get("p99"), + "tps": tps, "wall": wall, "pu": pu, "du": du, "apc": a, + }) + + if as_json: + print(json.dumps(rows)) + return + + def f(x, w=7, p=1): + return f"{x:>{w}.{p}f}" if isinstance(x, (int, float)) else f"{'-':>{w}}" + + hdr = (f"{'run':<34}{'arm':>7}{'ok/req':>9}{'E2Ep50':>8}{'E2Ep90':>8}{'E2Ep99':>8}" + f"{'TPS':>8}{'Putil':>7}{'Dutil':>7}{'APC%':>7}{'TTFTp90':>9}{'TPOTp99ms':>10}") + print(hdr); print("-" * len(hdr)) + for r in sorted(rows, key=lambda r: r["name"]): + print(f"{r['name']:<34}{r['arm']:>7}{str(r['n'])+'/'+str(r['req']):>9}" + f"{f(r['e2e_p50'])}{f(r['e2e_p90'])}{f(r['e2e_p99'])}" + f"{f(r['tps'],8,1)}{f(r['pu'])}{f(r['du'])}" + f"{f((r['apc'] or 0)*100)}{f(r['ttft_p90'],9,2)}" + f"{f((r['tpot_p99'] or 0)*1000,10,1)}") + + +if __name__ == "__main__": + main() diff --git a/microbench/fresh_setup/gpu_util_report.py b/microbench/fresh_setup/gpu_util_report.py new file mode 100644 index 0000000..25eb3a3 --- /dev/null +++ b/microbench/fresh_setup/gpu_util_report.py @@ -0,0 +1,71 @@ +"""Per-GPU utilization report from gpu_util.csv (companion to bench_report.py). 
+ +bench_report's per-worker GPU util needs request routing (breakdown.json), which +the MB5 proxy doesn't log. But worker == GPU by index, and the prefill/decode role +split is fixed by config, so per-GPU util from gpu_util.csv directly answers +"GPU utils by workers" — and for PD it exposes the key signal: are the prefill-side +GPUs saturated while the decode-side idles (or vice versa, or stalled at ~0)? + +Usage: + gpu_util_report.py <run_dir> [--prefill-gpus 0,1,2,3 --decode-gpus 4,5,6,7] +""" +from __future__ import annotations + +import argparse +import csv +import json +import statistics +from pathlib import Path + + +def pct(xs, p): + xs = sorted(xs) + return xs[max(0, min(len(xs) - 1, int(round(p / 100 * (len(xs) - 1)))))] if xs else None + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("run_dir", type=Path) + ap.add_argument("--prefill-gpus", default="") + ap.add_argument("--decode-gpus", default="") + a = ap.parse_args() + + win = {} + wp = a.run_dir / "run_window.json" + if wp.exists(): + win = json.load(open(wp)) + t0, t1 = win.get("t_start_unix"), win.get("t_end_unix") + + csvp = a.run_dir / "gpu_util.csv" + if not csvp.exists(): + print(f"{a.run_dir.name}: gpu_util.csv absent"); return + by_gpu = {} + for row in csv.DictReader(open(csvp)): + try: + ts = float(row["timestamp"]); g = int(row["gpu"]); u = float(row["util_pct"]); m = float(row["mem_used_mb"]) + except (ValueError, KeyError): + continue + if t0 and not (t0 <= ts <= t1): + continue + by_gpu.setdefault(g, {"u": [], "m": []}) + by_gpu[g]["u"].append(u); by_gpu[g]["m"].append(m) + + print(f"=== {a.run_dir.name}: per-GPU util over replay window ({sum(len(d['u']) for d in by_gpu.values())} samples) ===") + print(f"{'gpu':>4}{'util_mean':>11}{'util_p90':>10}{'util_max':>10}{'mem_max_GB':>12}") + for g in sorted(by_gpu): + u, m = by_gpu[g]["u"], by_gpu[g]["m"] + print(f"{g:>4}{statistics.fmean(u):>11.1f}{pct(u,90):>10.1f}{max(u):>10.1f}{max(m)/1024:>12.1f}") + + def agg(gpus, label): + 
gpus = [int(x) for x in gpus.split(",") if x != ""] + us = [v for g in gpus for v in by_gpu.get(g, {}).get("u", [])] + if us: + print(f" {label:<14} gpus={gpus} util mean={statistics.fmean(us):.1f}% p90={pct(us,90):.1f}% max={max(us):.1f}%") + if a.prefill_gpus: + agg(a.prefill_gpus, "prefill-side") + if a.decode_gpus: + agg(a.decode_gpus, "decode-side") + + +if __name__ == "__main__": + main() diff --git a/microbench/fresh_setup/mb5_run.sh b/microbench/fresh_setup/mb5_run.sh index c887f70..cbbb0ec 100755 --- a/microbench/fresh_setup/mb5_run.sh +++ b/microbench/fresh_setup/mb5_run.sh @@ -69,6 +69,13 @@ run_one() { source "${VENV}/bin/activate" local replay_out="${rundir}/replay_metrics.jsonl" mkdir -p "$(dirname "${replay_out}")" + # bench_report.py inputs: worker->gpu map (worker i == gpu i for every config; + # for PD, workers 0-3 are producers on gpu0-3, 4-7 consumers on gpu4-7). + printf '{"base_port":8000,"n_instances":8,"gpu_indices":[0,1,2,3,4,5,6,7]}\n' \ + > "${rundir}/bench_config.json" + # per-GPU utilization timeseries over the replay window (2s sampling) + bash "${SCRIPT_DIR}/gpu_monitor.sh" "${rundir}/gpu_util.csv" 2 >/dev/null 2>&1 & + local GPU_MON=$! local t0 t0=$(date +%s.%N) if ! 
PYTHONPATH="${FRESH_ROOT}" python -m replayer \ @@ -82,6 +89,7 @@ run_one() { t1=$(date +%s.%N) local wall=$(python -c "print(${t1} - ${t0})") echo "[mb5-run] REPLAY FAILED after ${wall} s; see ${OUT_ROOT}/${config}_rep${rep}_replay.log" + kill "${GPU_MON}" 2>/dev/null || true bash "${LAUNCH}" stop > /dev/null 2>&1 || true return 1 fi @@ -91,6 +99,9 @@ run_one() { wall_clock_s=$(python -c "print(${t1} - ${t0})") echo "[mb5-run] replay done in ${wall_clock_s}s" echo "${wall_clock_s}" > "${rundir}/wall_clock_s.txt" + kill "${GPU_MON}" 2>/dev/null || true + printf '{"t_start_unix":%s,"t_end_unix":%s}\n' "${t0}" "${t1}" > "${rundir}/run_window.json" + cp -f "${replay_out}" "${rundir}/metrics.jsonl" # bench_report.py expects metrics.jsonl # Per-instance prefix-cache counters, scraped from each backend BEFORE # teardown. For PD this is the only honest reuse signal: producer ports diff --git a/microbench/fresh_setup/partial_summary.py b/microbench/fresh_setup/partial_summary.py new file mode 100644 index 0000000..bf17361 --- /dev/null +++ b/microbench/fresh_setup/partial_summary.py @@ -0,0 +1,98 @@ +"""Compute a per-run summary directly from replay_metrics.jsonl (for partial / in-flight runs). + +Used when the replayer hasn't completed (so replay_metrics.summary.json doesn't exist +yet) but enough records have streamed to disk to read out the per-arm result. + +Also accepts a finished run's directory and prints the same one-line summary for +apples-to-apples comparison. 
# Record count the report line treats as a complete run (value hard-coded in
# the original "target=1214" message).
_TARGET_REQUESTS = 1214


def stats(xs):
    """Summarise a sample list as n / mean / p50 / p90 / p99, or None if empty.

    Percentiles are picked by index from the sorted list without
    interpolation: p50 is ``xs[n // 2]``, p90 and p99 are
    ``xs[int(q * (n - 1))]``.
    """
    xs = sorted(xs)
    n = len(xs)
    if n == 0:
        return None
    return {
        "n": n,
        "mean": statistics.fmean(xs),
        "p50": xs[n // 2],
        "p90": xs[int(0.9 * (n - 1))],
        "p99": xs[int(0.99 * (n - 1))],
    }


def apc(run: Path, producer_ports):
    """Return hits/queries summed over ``producer_ports`` from instance_apc.txt.

    Each line is scanned for ``key=value`` tokens; lines whose ``port`` is in
    ``producer_ports`` contribute their ``queries`` and ``hits``. Lines with
    non-numeric values are skipped. Returns None when the file is absent or
    no queries were counted.
    """
    f = run / "instance_apc.txt"
    if not f.exists():
        return None
    wanted = set(producer_ports)
    q = h = 0.0
    with open(f) as fh:
        for line in fh:
            m = dict(re.findall(r"(\w+)=(\S+)", line))
            try:
                if int(m.get("port", -1)) not in wanted:
                    continue
                line_q = float(m.get("queries", 0))
                line_h = float(m.get("hits", 0))
            except ValueError:
                continue
            q += line_q
            h += line_h
    return (h / q) if q else None


def _load_records(path):
    """Parse a JSONL file, skipping blank and undecodable lines.

    A file still being written can end in a half-written line; dropping it
    lets the summary be computed over the records that are complete.
    """
    recs = []
    with open(path) as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(rec, dict):
                recs.append(rec)
    return recs


def _present(recs, key):
    """Return ``rec[key]`` for every record where it exists and is not None."""
    return [r[key] for r in recs if r.get(key) is not None]


def _output_tokens(rec):
    """Return the record's output token count, falling back to output_length."""
    tokens = rec.get("actual_output_tokens")
    if tokens is None:
        tokens = rec.get("output_length")
    return tokens or 0


def main():
    """Print a latency / throughput / APC summary for each run dir in argv."""
    for d in sys.argv[1:]:
        run = Path(d)
        # Take the first record file that exists, in this order: the partial
        # stream, then the replayer output, then the bench_report copy.
        for fn in ("replay_metrics.partial.jsonl", "replay_metrics.jsonl", "metrics.jsonl"):
            p = run / fn
            if p.exists():
                rec_path = p
                break
        else:
            print(f"{run.name}: no records")
            continue
        recs = _load_records(rec_path)
        oks = [r for r in recs if r.get("error") is None]
        lat = stats(_present(oks, "latency_s"))
        ttft = stats(_present(oks, "ttft_s"))
        tpot = stats(_present(oks, "tpot_s"))
        out = sum(_output_tokens(r) for r in oks)
        ts = _present(oks, "t_dispatch_unix")
        tf = _present(oks, "t_finish_unix")
        span = max(tf) - min(ts) if ts and tf else 0
        tps = out / span if span else 0

        # producer ports by arm tag in dirname
        n = run.name
        if "_colo_" in n:
            ports = list(range(8000, 8008))
        elif "_pd6_" in n:
            ports = list(range(8000, 8006))
        elif "_pd2_" in n:
            ports = list(range(8000, 8002))
        else:
            ports = list(range(8000, 8004))
        a = apc(run, ports)

        print(f"{run.name}")
        print(f" n_ok={len(oks)}/{len(recs)}"
              + (f" (target={_TARGET_REQUESTS} -> {len(oks)*100/_TARGET_REQUESTS:.1f}%)"
                 if len(recs) < _TARGET_REQUESTS else ""))
        if lat:
            print(f" E2E mean={lat['mean']:.2f} p50={lat['p50']:.2f} p90={lat['p90']:.2f} p99={lat['p99']:.2f}")
        if ttft:
            print(f" TTFT mean={ttft['mean']:.2f} p50={ttft['p50']:.2f} p90={ttft['p90']:.2f} p99={ttft['p99']:.2f}")
        if tpot:
            print(f" TPOT mean={tpot['mean']*1000:.1f}ms p90={tpot['p90']*1000:.1f}ms p99={tpot['p99']*1000:.1f}ms")
        print(f" output_tokens={out:.0f} span={span:.0f}s TPS={tps:.0f}")
        if a is not None:
            print(f" producer APC={a*100:.1f}%")


if __name__ == "__main__":
    main()
Env: REPLAY_MAX_DURATION") p.add_argument("--request-limit", type=int, default=None, help="Limit number of requests to replay") p.add_argument("-v", "--verbose", action="store_true") @@ -56,7 +67,9 @@ def main() -> None: request_limit=args.request_limit, max_inflight_sessions=args.max_inflight_sessions, inter_turn_think_s=args.inter_turn_think, + no_realized_prefix=args.no_realized_prefix, dispatch_mode=args.dispatch_mode, + max_duration_s=args.max_duration, ) results = asyncio.run(replay_trace(config)) diff --git a/replayer/replay.py b/replayer/replay.py index 0d55441..b2832eb 100644 --- a/replayer/replay.py +++ b/replayer/replay.py @@ -66,6 +66,13 @@ class ReplayConfig: # max_inflight_sessions=N this is a stable N-user closed-loop (no open-loop # runaway), so it removes the "immediate retrigger under load" artifact. inter_turn_think_s: float | None = None + # Controlled-reuse mode: skip _apply_realized_prefix so each turn's prompt is + # exactly the hash-built tokens. Then prefix-cache reuse is governed solely by + # the generated hash_ids (shared prefix blocks hit, fresh delta blocks miss) — + # required for the reuse-fraction sweep, where realized-prefix would otherwise + # force every fixed-length turn to ≈ the prior turn (≈100% reuse regardless). + # Keep OFF (realized-prefix ON) for the real agentic trace. + no_realized_prefix: bool = False # Dispatch timing for intra-session turns: # "tracets" (Mode 1): fire at absolute trace timestamp -> effectively # max(prev_finished, trace_ts); collapses think-time to 0 when @@ -73,6 +80,25 @@ class ReplayConfig: # "thinktime" (Mode 2): turn-1 at trace arrival; turn-k at # prev_finished + time_to_parent_chat (real production gap). dispatch_mode: str = "tracets" + # Overall wall-clock deadline for the whole replay (seconds). 
def _skipped_metric() -> "RequestMetrics":
    """Build the failure row that stands in for a turn cut off by max_duration.

    The row carries a non-None ``error`` and None for latency / ttft / tpot;
    every other field is a fixed placeholder value.
    """
    placeholder = dict(
        request_id="deadline_skipped",
        session_id="",
        turn_id=-1,
        trace_timestamp_s=0.0,
        input_length=0,
        output_length=0,
        request_type="skipped",
        effective_input_length=None,
        cached_tokens=0,
        latency_s=None,
        ttft_s=None,
        tpot_s=None,
        error="deadline_skipped",
    )
    return RequestMetrics(**placeholder)
session_sem=session_sem, )) - for sid, turns in sessions + for st in states ] - all_results = await asyncio.gather(*tasks) + if config.max_duration_s and config.max_duration_s > 0: + _done, pending = await asyncio.wait( + tasks, timeout=config.max_duration_s) + if pending: + logger.warning( + "max_duration %.0fs reached: cancelling %d in-flight " + "session(s); un-run turns counted as failures", + config.max_duration_s, len(pending)) + for t in pending: + t.cancel() + await asyncio.gather(*pending, return_exceptions=True) + else: + await asyncio.gather(*tasks) finally: sink.close() sweep_elapsed = time.perf_counter() - sweep_start post_metrics = await _snapshot_prefix_cache_metrics(config.endpoint_url) - flat = [m for group in all_results for m in group] + # Build from the session states (identical to the gather return in the + # uncapped path) so partially-completed (cancelled) sessions still contribute + # their finished turns; pad un-run turns as failures so request_count == trace. + flat = [m for st in states for m in st.metrics] + missing = n_requests - len(flat) + if missing > 0: + flat.extend(_skipped_metric() for _ in range(missing)) summary_path = config.output_path.with_suffix(".summary.json") write_summary_json(summary_path, flat)