#!/usr/bin/env python3
"""Verify notebook sessions and check for untracked IO."""
from __future__ import annotations
import json
import re
from pathlib import Path
from typing import Dict, List, Union
from ._parse import get_code_cells
# Regexes used to scan notebook cell source text. Each accepts either the
# full package name or the short alias (``scitex.`` or ``stx.``).
# Start of a ``scitex.io.load(`` call (whitespace allowed before the paren).
_IO_LOAD_RE = re.compile(r"(?:scitex|stx)\.io\.load\s*\(")
# Start of a ``scitex.io.save(`` call (whitespace allowed before the paren).
_IO_SAVE_RE = re.compile(r"(?:scitex|stx)\.io\.save\s*\(")
# A ``@scitex.session`` decorator occurring anywhere in the text.
_SESSION_RE = re.compile(r"@(?:scitex|stx)\.session")
# --- Public API: session verification ---
def verify_notebook(
    path: Union[str, Path],
    *,
    db=None,
    verify_run_fn=None,
) -> List[Dict]:
    """Verify all clew sessions associated with a notebook.

    Finds all runs in the clew DB that are associated with this notebook's
    resolved path, then runs L1 (cache) verification on each.

    Parameters
    ----------
    path : str or Path
        Path to the .ipynb file. It is resolved to an absolute path before
        being compared against the paths recorded in the DB.
    db : optional
        Pre-resolved clew DB handle. If omitted, the default DB from
        ``scitex_clew.get_db()`` is used. Exposed for testability so
        tests can inject a hand-rolled fake without patching internals.
    verify_run_fn : optional
        Callable ``(session_id) -> Verification``. If omitted,
        ``scitex_clew.verify_run`` is used. Exposed for testability.

    Returns
    -------
    list of dict
        One record per matching run, each with the keys ``session_id``,
        ``status``, ``is_verified`` and ``started_at``. When verifying a
        run raises, its record has ``status == "error"``,
        ``is_verified == False`` and an extra ``error`` key holding the
        exception message.
    """
    if db is None or verify_run_fn is None:
        # Imported lazily: callers that inject both collaborators never
        # trigger the scitex_clew import.
        from scitex_clew import get_db, verify_run

        if db is None:
            db = get_db()
        if verify_run_fn is None:
            verify_run_fn = verify_run

    notebook_path = str(Path(path).resolve())

    results = []
    for run in _get_runs_for_notebook(db, notebook_path):
        session_id = run["session_id"]
        started_at = run.get("started_at")
        try:
            verification = verify_run_fn(session_id)
            record = {
                "session_id": session_id,
                "status": verification.status.value,
                "is_verified": verification.is_verified,
                "started_at": started_at,
            }
        except Exception as exc:
            # Best effort: one failing session must not abort the others.
            # The record keeps the same keys as a successful one (plus
            # ``error``) so consumers can treat all rows uniformly.
            record = {
                "session_id": session_id,
                "status": "error",
                "is_verified": False,
                "started_at": started_at,
                "error": str(exc),
            }
        results.append(record)
    return results
# --- Public API: untracked-IO check ---
def check_notebook(path: Union[str, Path]) -> List[Dict]:
    """Report code cells that call scitex.io without a @scitex.session.

    A cell is reported when its source matches the load and/or save
    pattern and contains no session decorator.

    Parameters
    ----------
    path : str or Path
        Path to the .ipynb file.

    Returns
    -------
    list of dict
        One entry per offending cell:
        {index, has_load, has_save, has_session}.
    """
    untracked = []
    for cell in get_code_cells(path):
        text = cell["source"]
        loads = _IO_LOAD_RE.search(text) is not None
        saves = _IO_SAVE_RE.search(text) is not None

        # Cells without any scitex.io call are never reported.
        if not (loads or saves):
            continue
        # IO alongside a session decorator counts as tracked.
        if _SESSION_RE.search(text) is not None:
            continue

        untracked.append(
            {
                "index": cell["index"],
                "has_load": loads,
                "has_save": saves,
                # Always False here: decorated cells were skipped above.
                "has_session": False,
            }
        )
    return untracked
def _get_runs_for_notebook(db, notebook_path: str) -> List[Dict]:
    """Query clew DB for runs associated with a notebook path.

    A run matches when the ``notebook_path`` recorded in its metadata
    resolves to ``notebook_path``. If the metadata records no notebook
    path (metadata missing, unparsable, not a JSON object, or lacking the
    key), the run's ``script_path`` is used instead, provided it ends in
    ``.ipynb``. A notebook path recorded in metadata is authoritative: when
    it points elsewhere, ``script_path`` is not consulted.

    Parameters
    ----------
    db
        Object exposing ``list_runs(limit=...)`` that returns dict-like
        run records. Only the first 1000 runs it returns are inspected.
    notebook_path : str
        Already-resolved absolute path of the notebook to look for.

    Returns
    -------
    list of dict
        Matching runs sorted by ``started_at``; runs whose ``started_at``
        is missing or None sort first.
    """

    def _points_here(candidate) -> bool:
        # A value that cannot be turned into a resolved path can never
        # match, so it is reported as "no match" instead of raising.
        try:
            return str(Path(candidate).resolve()) == notebook_path
        except (TypeError, ValueError, OSError):
            return False

    matched = []
    for run in db.list_runs(limit=1000):
        raw_meta = run.get("metadata")
        if isinstance(raw_meta, dict):
            # Metadata already decoded by the DB layer.
            meta = raw_meta
        elif raw_meta:
            try:
                meta = json.loads(raw_meta)
            except (ValueError, TypeError):
                # json.JSONDecodeError is a ValueError subclass.
                meta = None
        else:
            meta = None

        # Valid JSON is not necessarily an object (e.g. "[]"), so guard
        # before calling .get on it.
        recorded = meta.get("notebook_path") if isinstance(meta, dict) else None
        if recorded:
            if _points_here(recorded):
                matched.append(run)
            continue

        # Fall back to script_path whenever metadata names no notebook.
        script_path = run.get("script_path", "")
        if (
            isinstance(script_path, str)
            and script_path.endswith(".ipynb")
            and _points_here(script_path)
        ):
            matched.append(run)

    # ``or ""`` also covers a started_at that is present but None, which
    # would otherwise fail to compare against string timestamps.
    return sorted(matched, key=lambda run: run.get("started_at") or "")
# EOF