Fix trace reuse and packaged model assets
This commit is contained in:
@@ -14,6 +14,9 @@ from .reporting import write_reports
|
||||
from .resume_advanced import collect_existing_detail_paths, run_advanced_from_existing
|
||||
from .study import parse_input_length_bucket_thresholds, run_study
|
||||
|
||||
# File name of the provenance record kept inside an analysis output directory.
ANALYSIS_PROVENANCE_FILENAME = "analysis_provenance.json"
|
||||
# Value stored under the "schema_version" key of each provenance record.
ANALYSIS_PROVENANCE_SCHEMA_VERSION = 1
|
||||
|
||||
|
||||
def build_parser():
|
||||
parser = argparse.ArgumentParser(description="Analyze coding-agent trace patterns.")
|
||||
@@ -282,6 +285,70 @@ def _existing_detail_outputs(output_dir):
|
||||
return collect_existing_detail_paths(output_dir)
|
||||
|
||||
|
||||
def _file_fingerprint(path: str | Path) -> dict:
|
||||
resolved = Path(path).resolve()
|
||||
stat = resolved.stat()
|
||||
return {
|
||||
"path": str(resolved),
|
||||
"size": stat.st_size,
|
||||
"mtime_ns": stat.st_mtime_ns,
|
||||
}
|
||||
|
||||
|
||||
def _path_option(value: str | None) -> str | None:
|
||||
if value is None:
|
||||
return None
|
||||
path = Path(value)
|
||||
return str(path.resolve()) if path.exists() else str(value)
|
||||
|
||||
|
||||
def _expected_analysis_provenance(args, release_input_path: Path, input_length_bucket_thresholds: list[int]) -> dict:
    """Build the provenance record that the current invocation corresponds to.

    The record combines fingerprints of the raw and release input files with
    the analysis options read from *args*.

    Args:
        args: Parsed CLI namespace; ``input``, ``block_size``,
            ``segment_mode``, ``tokenizer_path``, ``tokenizer_batch_size``,
            ``model_family`` and ``model_meta_dir`` are read from it.
        release_input_path: Location of the release trace to fingerprint.
        input_length_bucket_thresholds: Bucket thresholds, stored as given
            under ``options["input_length_buckets"]``.

    Returns:
        A dict with the keys ``schema_version``, ``raw_input``,
        ``release_input`` and ``options``.

    Raises:
        OSError: If either input file cannot be stat'ed.
    """
    # Fingerprint the inputs first, in the same order as the record lists them.
    raw_fingerprint = _file_fingerprint(args.input)
    release_fingerprint = _file_fingerprint(release_input_path)

    options = {
        "block_size": args.block_size,
        "segment_mode": args.segment_mode,
        "tokenizer_path": _path_option(args.tokenizer_path),
        "tokenizer_batch_size": args.tokenizer_batch_size,
        "model_family": args.model_family,
        "model_meta_dir": _path_option(args.model_meta_dir),
        "input_length_buckets": input_length_bucket_thresholds,
    }

    return {
        "schema_version": ANALYSIS_PROVENANCE_SCHEMA_VERSION,
        "raw_input": raw_fingerprint,
        "release_input": release_fingerprint,
        "options": options,
    }
|
||||
|
||||
|
||||
def _analysis_provenance_path(output_dir: Path) -> Path:
    """Return the location of the provenance record inside *output_dir*."""
    return output_dir.joinpath(ANALYSIS_PROVENANCE_FILENAME)
|
||||
|
||||
|
||||
def _load_analysis_provenance(output_dir: Path) -> dict | None:
    """Read the provenance record stored in *output_dir*, if it is usable.

    Any problem with the stored record is treated as "no record" so that the
    caller falls back to recomputing instead of crashing.

    Args:
        output_dir: Analysis output directory that may hold a record.

    Returns:
        The decoded JSON object, or ``None`` when the file is missing, cannot
        be read, is not valid UTF-8, is not valid JSON, or its top-level value
        is not a JSON object.
    """
    path = _analysis_provenance_path(output_dir)
    try:
        with path.open("r", encoding="utf-8") as handle:
            data = json.load(handle)
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError, so a record corrupted
        # with invalid bytes is ignored rather than aborting the run.
        return None
    return data if isinstance(data, dict) else None
|
||||
|
||||
|
||||
def _analysis_provenance_matches(output_dir: Path, expected: dict) -> bool:
    """Tell whether the record stored in *output_dir* equals *expected*.

    A missing or unusable stored record never matches.
    """
    recorded = _load_analysis_provenance(output_dir)
    return recorded == expected
|
||||
|
||||
|
||||
def _write_analysis_provenance(output_dir: Path, provenance: dict) -> None:
    """Persist *provenance* as the provenance record of *output_dir*.

    The record is written to a sibling ``.tmp`` file first and then moved
    over the final name, so readers never observe a partially written record.

    Args:
        output_dir: Analysis output directory; created if it does not exist.
        provenance: JSON-serializable record to store.

    Raises:
        OSError: If the directory or file cannot be written.
        TypeError: If *provenance* contains values ``json`` cannot serialize.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    destination = _analysis_provenance_path(output_dir)
    temp_path = destination.with_suffix(destination.suffix + ".tmp")
    try:
        with temp_path.open("w", encoding="utf-8") as handle:
            json.dump(provenance, handle, ensure_ascii=False, indent=2, sort_keys=True)
            handle.write("\n")
        temp_path.replace(destination)
    except BaseException:
        # Do not leave a half-written temp file behind when serialization or
        # the rename fails; the original error is re-raised afterwards.
        try:
            temp_path.unlink(missing_ok=True)
        except OSError:
            # Best-effort cleanup only: the original failure matters more.
            pass
        raise
|
||||
|
||||
|
||||
def _stage_message(progress, step: int, total_steps: int, message: str) -> None:
|
||||
tqdm.write(f"Stage {step}/{total_steps}: {message}")
|
||||
progress.update(1)
|
||||
@@ -301,6 +368,8 @@ def main(argv=None):
|
||||
f"Release trace not found for raw trace {args.input}. "
|
||||
"Run `python -m trace_formatter build-release <raw-trace>` first, or pass --release-input."
|
||||
)
|
||||
expected_provenance = _expected_analysis_provenance(args, release_input_path, input_length_bucket_thresholds)
|
||||
reuse_allowed = _analysis_provenance_matches(output_dir, expected_provenance)
|
||||
total_steps = 4
|
||||
progress = tqdm(
|
||||
total=total_steps,
|
||||
@@ -310,7 +379,7 @@ def main(argv=None):
|
||||
)
|
||||
try:
|
||||
prepare_result = None
|
||||
reusable_base = _existing_base_outputs(output_dir)
|
||||
reusable_base = _existing_base_outputs(output_dir) if reuse_allowed else None
|
||||
if reusable_base:
|
||||
_stage_message(progress, 1, total_steps, "reuse existing features.csv")
|
||||
prepare_result = {
|
||||
@@ -320,7 +389,7 @@ def main(argv=None):
|
||||
else:
|
||||
_stage_message(progress, 1, total_steps, "prepare features.csv")
|
||||
prepare_result = stream_prepare(args.input, output_dir, show_progress=True)
|
||||
reusable_details = _existing_detail_outputs(output_dir)
|
||||
reusable_details = _existing_detail_outputs(output_dir) if reuse_allowed else None
|
||||
if reusable_details:
|
||||
_stage_message(progress, 2, total_steps, "reuse existing details/")
|
||||
advanced_paths = reusable_details
|
||||
@@ -362,6 +431,7 @@ def main(argv=None):
|
||||
dataset_title=dataset_name,
|
||||
show_progress=True,
|
||||
)
|
||||
_write_analysis_provenance(output_dir, expected_provenance)
|
||||
finally:
|
||||
progress.close()
|
||||
print(
|
||||
|
||||
Reference in New Issue
Block a user