//! Streaming JSONL reader for the qwen-bailian trace format. //! //! Schema (per upstream README): //! chat_id: i64 //! parent_chat_id: i64 (-1 = root) //! timestamp: f64 (seconds since trace start) //! input_length: i64 //! output_length: i64 //! type: string (text/search/image/file) //! turn: i64 //! hash_ids: [i64] (16-token blocks, salted SipHash) use anyhow::{Context, Result}; use serde::Deserialize; use std::fs::File; use std::io::{BufRead, BufReader}; use std::path::Path; #[derive(Debug, Clone, Deserialize)] struct RawRecord { #[serde(default)] chat_id: i64, #[serde(default)] parent_chat_id: i64, #[serde(default)] timestamp: f64, #[serde(default)] input_length: i64, #[serde(default)] output_length: i64, #[serde(default)] turn: i64, #[serde(default)] hash_ids: Vec, } #[derive(Debug, Clone)] pub struct RequestRecord { pub req_id: u64, pub chat_id: i64, pub parent_chat_id: i64, pub turn: i64, pub arrival: f64, pub input_len: u32, pub output_len: u32, pub hash_ids: Vec, } pub struct TraceReader { inner: BufReader, next_id: u64, line_buf: String, max_requests: Option, } impl TraceReader { pub fn open>(path: P, max_requests: Option) -> Result { let path = path.as_ref(); let f = File::open(path).with_context(|| format!("opening trace {}", path.display()))?; Ok(Self { inner: BufReader::with_capacity(1 << 20, f), next_id: 0, line_buf: String::with_capacity(4096), max_requests, }) } } impl Iterator for TraceReader { type Item = Result; fn next(&mut self) -> Option { if let Some(cap) = self.max_requests { if self.next_id >= cap { return None; } } loop { self.line_buf.clear(); match self.inner.read_line(&mut self.line_buf) { Ok(0) => return None, Ok(_) => { let trimmed = self.line_buf.trim(); if trimmed.is_empty() { continue; } let parsed: Result = serde_json::from_str(trimmed); let raw = match parsed { Ok(r) => r, Err(e) => return Some(Err(anyhow::anyhow!("trace parse: {e}"))), }; let id = self.next_id; self.next_id += 1; return Some(Ok(RequestRecord { req_id: id, chat_id: raw.chat_id, parent_chat_id: raw.parent_chat_id, turn: raw.turn, arrival: raw.timestamp, input_len: raw.input_length.max(0) as u32, output_len: raw.output_length.max(0) as u32, hash_ids: raw.hash_ids.into_iter().map(|h| h as u64).collect(), })); } Err(e) => return Some(Err(e.into())), } } } }