#!/usr/bin/env python3
"""Test concurrent requests to verify continuous batching scheduling.

Fires one chat-completion request per entry in ``PROMPTS`` at ``URL``, all at
the same time, then compares the wall-clock time of the whole run against the
sum of the individual request latencies.
"""

import json
import time
import urllib.request
import concurrent.futures

URL = "http://localhost:9090/v1/chat/completions"

PROMPTS = [
    "What is 1+1?",
    "What is 2+2?",
    "What is 3+3?",
    "What is 4+4?",
    "What is 5+5?",
    "What is 6+6?",
    "What is 7+7?",
    "What is 8+8?",
]


def build_payload(prompt):
    """Return the JSON-encoded request body (bytes) for a single user *prompt*."""
    return json.dumps({
        "model": "qwen3-8b",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 32,
        "temperature": 0.0,
    }).encode()


def parse_completion(data):
    """Extract ``(completion_tokens, preview)`` from a decoded response dict.

    ``preview`` is the first 50 characters of the first choice's message
    content with newlines replaced by spaces. A null content is treated as an
    empty string instead of raising ``TypeError`` on the slice.

    Raises:
        KeyError, IndexError: if the response lacks the expected fields.
    """
    text = data["choices"][0]["message"]["content"] or ""
    preview = text[:50].replace("\n", " ")
    return data["usage"]["completion_tokens"], preview


def summarize(results, t_total):
    """Compute the aggregate figures reported by ``main``.

    Args:
        results: iterable of ``(idx, prompt, elapsed, ct, content)`` tuples as
            returned by ``send_request``.
        t_total: wall-clock seconds for the whole concurrent run.

    Returns:
        ``(serial_estimate, total_tokens, speedup, throughput)`` where
        ``serial_estimate`` is the sum of per-request latencies, ``speedup`` is
        ``serial_estimate / t_total`` and ``throughput`` is tokens per second.
        Both ratios are ``0.0`` when ``t_total`` is not positive.
    """
    results = list(results)
    serial_estimate = sum(r[2] for r in results)
    total_tokens = sum(r[3] for r in results)
    if t_total > 0:
        speedup = serial_estimate / t_total
        throughput = total_tokens / t_total
    else:
        # Avoid ZeroDivisionError on a degenerate (zero-length) run.
        speedup = throughput = 0.0
    return serial_estimate, total_tokens, speedup, throughput


def send_request(prompt, idx):
    """POST one chat-completion request and time it.

    Args:
        prompt: user message text to send.
        idx: caller-chosen index, returned unchanged so results can be ordered.

    Returns:
        ``(idx, prompt, elapsed, ct, content)`` where ``elapsed`` is the
        request latency in seconds, ``ct`` is the reported completion token
        count and ``content`` is a single-line preview of the reply.

    Raises:
        urllib.error.URLError: if the request fails or times out (120 s).
        KeyError, IndexError: if the response JSON lacks the expected fields.
    """
    req = urllib.request.Request(
        URL,
        data=build_payload(prompt),
        headers={"Content-Type": "application/json"},
    )
    t0 = time.perf_counter()
    # The context manager closes the connection even if reading fails; the
    # body read is inside the timed region so latency covers the full reply.
    with urllib.request.urlopen(req, timeout=120) as resp:
        raw = resp.read()
    elapsed = time.perf_counter() - t0

    ct, content = parse_completion(json.loads(raw))
    return idx, prompt, elapsed, ct, content


def main():
    """Run all prompts concurrently and print per-request and aggregate stats."""
    n_requests = len(PROMPTS)
    # NOTE(review): max_batch=4 is presumably the server's configured batch
    # size; it is only echoed here, not sent or checked.
    print(f"=== Concurrent request test ({n_requests} requests, max_batch=4) ===\n")

    # One worker per prompt so every request is in flight at the same time.
    t_start = time.perf_counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, n_requests)) as pool:
        futures = [pool.submit(send_request, p, i) for i, p in enumerate(PROMPTS)]
        # Futures are collected in submission order, so results are already
        # sorted by index.
        results = [f.result() for f in futures]
    t_total = time.perf_counter() - t_start

    for idx, prompt, elapsed, ct, content in results:
        print(f" [{idx}] {elapsed:5.2f}s | ct={ct:2d} | {prompt} -> {content}...")

    serial_estimate, total_tokens, speedup, throughput = summarize(results, t_total)
    print(f"\n Wall clock: {t_total:.2f}s")
    print(f" Sum of individual latencies: {serial_estimate:.2f}s")
    print(f" Concurrency speedup: {speedup:.2f}x (1.0x = no batching)")
    print(f" Total tokens: {total_tokens}")
    print(f" Throughput: {throughput:.1f} tok/s")

    # Requests that overlapped finish in less wall time than their summed
    # latencies; 85% is the threshold used to call that a real benefit.
    if t_total < serial_estimate * 0.85:
        print("\n Concurrent scheduling is working (wall < 85% of serial sum)")
    else:
        print("\n Limited concurrency benefit (scheduling correct, GPU still per-seq)")


if __name__ == "__main__":
    main()