Fix trace reuse and packaged model assets

commit 5a6b1acb49
parent 152f01613b
2026-05-08 21:24:29 +08:00
6 changed files with 210 additions and 37 deletions

@@ -14,6 +14,9 @@ from .reporting import write_reports
from .resume_advanced import collect_existing_detail_paths, run_advanced_from_existing
from .study import parse_input_length_bucket_thresholds, run_study
ANALYSIS_PROVENANCE_FILENAME = "analysis_provenance.json"
ANALYSIS_PROVENANCE_SCHEMA_VERSION = 1


def build_parser():
    parser = argparse.ArgumentParser(description="Analyze coding-agent trace patterns.")
@@ -282,6 +285,70 @@ def _existing_detail_outputs(output_dir):
    return collect_existing_detail_paths(output_dir)


def _file_fingerprint(path: str | Path) -> dict:
    resolved = Path(path).resolve()
    stat = resolved.stat()
    return {
        "path": str(resolved),
        "size": stat.st_size,
        "mtime_ns": stat.st_mtime_ns,
    }
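# Example fingerprint (values illustrative):
#   {"path": "/data/traces/run1.jsonl", "size": 48210533, "mtime_ns": 1715174669123456789}
# Size plus nanosecond mtime is a cheap change signal that avoids hashing large traces.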


def _path_option(value: str | None) -> str | None:
    if value is None:
        return None
    path = Path(value)
    return str(path.resolve()) if path.exists() else str(value)
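# Existing paths are resolved so relative and absolute spellings compare equal;
# missing paths are recorded verbatim instead of being guessed at.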


def _expected_analysis_provenance(args, release_input_path: Path, input_length_bucket_thresholds: list[int]) -> dict:
    return {
        "schema_version": ANALYSIS_PROVENANCE_SCHEMA_VERSION,
        "raw_input": _file_fingerprint(args.input),
        "release_input": _file_fingerprint(release_input_path),
        "options": {
            "block_size": args.block_size,
            "segment_mode": args.segment_mode,
            "tokenizer_path": _path_option(args.tokenizer_path),
            "tokenizer_batch_size": args.tokenizer_batch_size,
            "model_family": args.model_family,
            "model_meta_dir": _path_option(args.model_meta_dir),
            "input_length_buckets": input_length_bucket_thresholds,
        },
    }
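# The returned document is exactly what gets persisted and later compared; shape only,
# values illustrative:
#   {"schema_version": 1,
#    "raw_input": {"path": "...", "size": 48210533, "mtime_ns": 1715174669123456789},
#    "release_input": {...},
#    "options": {"block_size": 64, "segment_mode": "auto", "tokenizer_path": null, ...}}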


def _analysis_provenance_path(output_dir: Path) -> Path:
    return output_dir / ANALYSIS_PROVENANCE_FILENAME


def _load_analysis_provenance(output_dir: Path) -> dict | None:
    path = _analysis_provenance_path(output_dir)
    if not path.exists():
        return None
    try:
        with path.open("r", encoding="utf-8") as handle:
            data = json.load(handle)
    except (OSError, json.JSONDecodeError):
        return None
    return data if isinstance(data, dict) else None


def _analysis_provenance_matches(output_dir: Path, expected: dict) -> bool:
    return _load_analysis_provenance(output_dir) == expected
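# Strict dict equality is deliberate: any touched input file, changed option, or a
# bumped ANALYSIS_PROVENANCE_SCHEMA_VERSION produces a mismatch and disables reuse.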


def _write_analysis_provenance(output_dir: Path, provenance: dict) -> None:
    output_dir.mkdir(parents=True, exist_ok=True)
    destination = _analysis_provenance_path(output_dir)
    temp_path = destination.with_suffix(destination.suffix + ".tmp")
    with temp_path.open("w", encoding="utf-8") as handle:
        json.dump(provenance, handle, ensure_ascii=False, indent=2, sort_keys=True)
        handle.write("\n")
    temp_path.replace(destination)
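# Write-to-temp then replace() is atomic on a single filesystem, so readers see
# either the previous stamp or the complete new one, never a partial write.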


def _stage_message(progress, step: int, total_steps: int, message: str) -> None:
    tqdm.write(f"Stage {step}/{total_steps}: {message}")
    progress.update(1)
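
The new helpers compose into a simple stamp-and-compare cache gate: fingerprint the inputs and options, compare against the stamp left by the last successful run, and reuse cached outputs only on an exact match. A standalone sketch of the same pattern (file name abbreviated, option values illustrative, not the module's real entry point):

import json
from pathlib import Path

STAMP = "analysis_provenance.json"

def run_or_reuse(out: Path, expected: dict) -> str:
    out.mkdir(parents=True, exist_ok=True)
    try:
        stored = json.loads((out / STAMP).read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError):
        stored = None
    if stored == expected:
        return "reused cached outputs"
    # ... recompute all stages here, then stamp the directory last ...
    tmp = out / (STAMP + ".tmp")
    tmp.write_text(json.dumps(expected, sort_keys=True) + "\n", encoding="utf-8")
    tmp.replace(out / STAMP)
    return "recomputed"

expected = {"schema_version": 1, "options": {"block_size": 64}}
print(run_or_reuse(Path("analysis_out"), expected))   # first run: recomputed
print(run_or_reuse(Path("analysis_out"), expected))   # second run: reused cached outputs

Anything that perturbs the expected document, such as a re-exported trace, a different tokenizer path, or new bucket thresholds, misses the cache and triggers a full rebuild.
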
@@ -301,6 +368,8 @@ def main(argv=None):
f"Release trace not found for raw trace {args.input}. "
"Run `python -m trace_formatter build-release <raw-trace>` first, or pass --release-input."
)
expected_provenance = _expected_analysis_provenance(args, release_input_path, input_length_bucket_thresholds)
reuse_allowed = _analysis_provenance_matches(output_dir, expected_provenance)
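    # Decided once up front; the features.csv and details/ reuse checks below both key off this flag.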
    total_steps = 4
    progress = tqdm(
        total=total_steps,
@@ -310,7 +379,7 @@ def main(argv=None):
    )
    try:
        prepare_result = None
        reusable_base = _existing_base_outputs(output_dir) if reuse_allowed else None
        if reusable_base:
            _stage_message(progress, 1, total_steps, "reuse existing features.csv")
            prepare_result = {
@@ -320,7 +389,7 @@ def main(argv=None):
        else:
            _stage_message(progress, 1, total_steps, "prepare features.csv")
            prepare_result = stream_prepare(args.input, output_dir, show_progress=True)
        reusable_details = _existing_detail_outputs(output_dir) if reuse_allowed else None
        if reusable_details:
            _stage_message(progress, 2, total_steps, "reuse existing details/")
            advanced_paths = reusable_details
@@ -362,6 +431,7 @@ def main(argv=None):
            dataset_title=dataset_name,
            show_progress=True,
        )
        _write_analysis_provenance(output_dir, expected_provenance)
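        # Written only on full success; a failure in any earlier stage leaves no
        # fresh stamp, so the next run rebuilds instead of trusting partial output.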
    finally:
        progress.close()
    print(