110 lines
3.2 KiB
Rust
110 lines
3.2 KiB
Rust
//! Streaming JSONL reader for the qwen-bailian trace format.
//!
//! Schema (per upstream README):
//!   chat_id:        i64
//!   parent_chat_id: i64    (-1 = root)
//!   timestamp:      f64    (seconds since trace start)
//!   input_length:   i64
//!   output_length:  i64
//!   type:           string (text/search/image/file)
//!   turn:           i64
//!   hash_ids:       [i64]  (16-token blocks, salted SipHash)
|
|
|
|
use anyhow::{Context, Result};
|
|
use serde::Deserialize;
|
|
use std::fs::File;
|
|
use std::io::{BufRead, BufReader};
|
|
use std::path::Path;
|
|
|
|
#[derive(Debug, Clone, Deserialize)]
|
|
struct RawRecord {
|
|
#[serde(default)]
|
|
chat_id: i64,
|
|
#[serde(default)]
|
|
parent_chat_id: i64,
|
|
#[serde(default)]
|
|
timestamp: f64,
|
|
#[serde(default)]
|
|
input_length: i64,
|
|
#[serde(default)]
|
|
output_length: i64,
|
|
#[serde(default)]
|
|
turn: i64,
|
|
#[serde(default)]
|
|
hash_ids: Vec<i64>,
|
|
}
|
|
|
|
/// A single normalised request yielded by the trace reader.
///
/// `PartialEq` is derived so records can be compared directly (for example in
/// tests); `Eq` is not available because `arrival` is an `f64`.
#[derive(Debug, Clone, PartialEq)]
pub struct RequestRecord {
    /// Zero-based sequence number assigned by the reader, in the order records
    /// are successfully parsed.
    pub req_id: u64,
    /// Conversation identifier copied from the trace.
    pub chat_id: i64,
    /// Parent conversation identifier copied from the trace (`-1` = root).
    pub parent_chat_id: i64,
    /// Turn index copied from the trace.
    pub turn: i64,
    /// Arrival time, taken from the trace's `timestamp` field
    /// (seconds since trace start).
    pub arrival: f64,
    /// Input length; negative values in the trace are clamped to `0`.
    pub input_len: u32,
    /// Output length; negative values in the trace are clamped to `0`.
    pub output_len: u32,
    /// Block hash identifiers, converted from the trace's signed 64-bit values
    /// to `u64` with an `as` cast (bit pattern preserved).
    pub hash_ids: Vec<u64>,
}
|
|
|
|
/// Streaming reader that yields one request record per non-empty line of a
/// JSONL trace file.
///
/// `Debug` is derived so the reader can be embedded in other `Debug` types and
/// inspected in logs.
#[derive(Debug)]
pub struct TraceReader {
    /// Buffered handle on the trace file.
    inner: BufReader<File>,
    /// Identifier that will be given to the next successfully parsed record;
    /// equivalently, the number of records yielded so far.
    next_id: u64,
    /// Scratch buffer reused for every line to avoid a per-line allocation.
    line_buf: String,
    /// Optional upper bound on the number of records to yield.
    max_requests: Option<u64>,
}
|
|
|
|
impl TraceReader {
|
|
pub fn open<P: AsRef<Path>>(path: P, max_requests: Option<u64>) -> Result<Self> {
|
|
let path = path.as_ref();
|
|
let f = File::open(path).with_context(|| format!("opening trace {}", path.display()))?;
|
|
Ok(Self {
|
|
inner: BufReader::with_capacity(1 << 20, f),
|
|
next_id: 0,
|
|
line_buf: String::with_capacity(4096),
|
|
max_requests,
|
|
})
|
|
}
|
|
}
|
|
|
|
impl Iterator for TraceReader {
|
|
type Item = Result<RequestRecord>;
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
if let Some(cap) = self.max_requests {
|
|
if self.next_id >= cap {
|
|
return None;
|
|
}
|
|
}
|
|
loop {
|
|
self.line_buf.clear();
|
|
match self.inner.read_line(&mut self.line_buf) {
|
|
Ok(0) => return None,
|
|
Ok(_) => {
|
|
let trimmed = self.line_buf.trim();
|
|
if trimmed.is_empty() {
|
|
continue;
|
|
}
|
|
let parsed: Result<RawRecord, _> = serde_json::from_str(trimmed);
|
|
let raw = match parsed {
|
|
Ok(r) => r,
|
|
Err(e) => return Some(Err(anyhow::anyhow!("trace parse: {e}"))),
|
|
};
|
|
let id = self.next_id;
|
|
self.next_id += 1;
|
|
return Some(Ok(RequestRecord {
|
|
req_id: id,
|
|
chat_id: raw.chat_id,
|
|
parent_chat_id: raw.parent_chat_id,
|
|
turn: raw.turn,
|
|
arrival: raw.timestamp,
|
|
input_len: raw.input_length.max(0) as u32,
|
|
output_len: raw.output_length.max(0) as u32,
|
|
hash_ids: raw.hash_ids.into_iter().map(|h| h as u64).collect(),
|
|
}));
|
|
}
|
|
Err(e) => return Some(Err(e.into())),
|
|
}
|
|
}
|
|
}
|
|
}
|