Source code for scitex_notebook._magic

#!/usr/bin/env python3
# Timestamp: "2026-05-02 (ywatanabe)"
# File: src/scitex_notebook/_magic.py
"""IPython extension that detects notebook reproducibility anti-patterns.

Adoption is one line at the top of any notebook::

    %load_ext scitex_notebook

After that, each executed cell is analysed at runtime and recorded in the
same Clew SQLite database used by ``@scitex.session`` and ``stx.io``. The
extension targets three of the canonical reproducibility risks for
notebooks identified in the literature:

1. **Hidden-state leak** (Pimentel 2019, Samuel 2024) — a cell reads a name
   that was *not* defined by any previously executed cell in this run, so
   it silently depends on prior kernel state. The extension AST-scans each
   cell, tracks ``(name → defining cell)``, and emits a runtime warning
   the first time a leaky name is loaded.
2. **Out-of-order execution** (Pimentel 2019, Wang 2020) — IPython's
   ``execution_count`` is recorded per cell so a post-run check can verify
   monotonic execution against the on-disk cell order.
3. **Untracked I/O** — every ``stx.io.save/load`` call inside the cell
   attaches to that cell's :class:`SessionTracker` via :func:`set_tracker`,
   so file hashes enter the per-cell record automatically.

The extension also rewrites the ``parent_session`` link to encode the real
data-dependency edge: ``parent`` becomes the most recent earlier cell whose
*defined* names overlap with this cell's *loaded* names. If the cell has no
intra-run dependency, ``parent`` is ``None`` (the cell is independent),
which is itself useful information.

Caveats
-------
* Hidden-state detection is name-level, not value-level — we do not
  snapshot the kernel namespace. A cell that mutates ``df`` in place is
  visible only through the cell that *defined* ``df``.
* AST analysis is a best-effort static approximation: cells using
  ``exec``/``eval``, dynamic ``globals()`` access, or IPython line-magics
  may produce false negatives.
* The extension never edits user code. All findings are advisory warnings
  in stderr plus structured metadata in the Clew DB.
"""

from __future__ import annotations

import ast
import builtins
import hashlib
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

# scitex_clew is a hard dependency: this magic is meaningless without it.
try:
    from scitex_clew._tracker import SessionTracker, set_tracker
except ImportError as e:  # pragma: no cover
    raise ImportError(
        "scitex_notebook._magic requires scitex-clew. "
        "Install with `pip install scitex-clew`."
    ) from e


# Python builtins resolve "for free" in any cell — never flag them as
# leaky. Includes things like ``print``, ``len``, ``True``, etc.
_BUILTIN_NAMES: frozenset[str] = frozenset(dir(builtins))

# IPython injects a few names into ``user_ns`` itself; treat them as
# always-available so we don't flood with false positives.
_IPYTHON_NAMES: frozenset[str] = frozenset(
    {
        "In",
        "Out",
        "_",
        "__",
        "___",
        "_i",
        "_ii",
        "_iii",
        "exit",
        "quit",
        "get_ipython",
    }
)

_AMBIENT_NAMES: frozenset[str] = _BUILTIN_NAMES | _IPYTHON_NAMES


def _short_id(text: str, n: int = 8) -> str:
    return hashlib.sha256(text.encode("utf-8")).hexdigest()[:n]


def _ast_loads_and_stores(src: str) -> tuple[set[str], set[str]]:
    """Return ``(loaded_names, stored_names)`` from a single cell.

    Conservative best-effort: returns empty sets on syntax error (e.g. the
    cell is an IPython line-magic like ``%timeit``). ``stored_names``
    includes names introduced by assignment, function/class definitions,
    and imports. ``loaded_names`` contains every ``ast.Name`` with a
    ``Load`` context, with builtins/IPython ambient names filtered out.
    """
    try:
        tree = ast.parse(src)
    except SyntaxError:
        return set(), set()

    loads: set[str] = set()
    stores: set[str] = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Name):
            if isinstance(node.ctx, ast.Load):
                loads.add(node.id)
            elif isinstance(node.ctx, (ast.Store, ast.Del)):
                stores.add(node.id)
        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            stores.add(node.name)
        elif isinstance(node, ast.Import):
            for n in node.names:
                stores.add((n.asname or n.name).split(".")[0])
        elif isinstance(node, ast.ImportFrom):
            for n in node.names:
                stores.add(n.asname or n.name)
        elif isinstance(node, ast.Global):
            stores.update(node.names)

    loads -= _AMBIENT_NAMES
    return loads, stores


def _detect_notebook_path(shell) -> Optional[Path]:
    """Best-effort detection of the running notebook's path.

    Order of attempts:

    1. ``scitex.gen.get_notebook_path()`` if importable.
    2. ``ipykernel`` + Jupyter server REST query (existing helper).
    3. ``IPYTHONDIR`` / fallback to a synthetic path under ``$PWD``.

    Returns ``None`` if no path can be resolved (we still track, just
    without a stable notebook handle).
    """
    try:
        from scitex_gen import get_notebook_path

        nb = get_notebook_path()
        if nb:
            return Path(nb)
    except Exception:
        pass
    # Last-resort fallback: synthesize a stable name from the shell session.
    if shell is not None:
        sess = getattr(shell, "history_manager", None)
        if sess is not None:
            sess_id = getattr(sess, "session_number", 0)
            return Path(os.getcwd()) / f"untitled-session-{sess_id}.ipynb"
    return None


class ScitexNotebookMagics:
    """Lightweight non-Magics IPython hook installer.

    We don't subclass ``IPython.core.magic.Magics`` because we don't expose
    user-callable ``%line`` or ``%%cell`` magics — the entire feature is
    automatic per-cell instrumentation triggered by ``%load_ext``.
    """

    def __init__(self, shell):
        self.shell = shell
        self.notebook_path = _detect_notebook_path(shell)
        self._exec_index = 0
        self._prev_session: Optional[str] = None
        # Per-cell scratch state populated in pre and consumed in post.
        self._tracker: Optional[SessionTracker] = None
        self._cell_t0: Optional[float] = None
        self._cell_src_hash: Optional[str] = None
        self._cell_loads: set[str] = set()
        self._cell_stores: set[str] = set()
        self._cell_warnings: list[dict[str, Any]] = []
        self._cell_dep_parents: list[str] = []

        # Per-run state for hidden-state-leak / dependency-DAG detection.
        # Maps each defined name to the (cell_index, session_id) that first
        # introduced it. Most-recent definition is recorded so re-defining
        # in a later cell shifts the dependency edge.
        self._name_defs: dict[str, tuple[int, str]] = {}

        # Aggregate warnings for post-run summary (visible to the cohort
        # runner via ``magic.warnings_summary()``).
        self.warnings: list[dict[str, Any]] = []

        # Stable per-notebook session-id prefix
        nb_stem = self.notebook_path.stem if self.notebook_path else "untitled-notebook"
        run_tag = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S")
        self._sid_prefix = f"nb-{nb_stem}-{run_tag}-{_short_id(nb_stem + run_tag, 6)}"

        shell.events.register("pre_run_cell", self._pre_run_cell)
        shell.events.register("post_run_cell", self._post_run_cell)

    # ------------------------------------------------------------------
    # Public summary API (called by the cohort runner)
    # ------------------------------------------------------------------

    def warnings_summary(self) -> dict[str, Any]:
        """Aggregate per-cell warnings into a runner-friendly dict."""
        kinds: dict[str, int] = {}
        for w in self.warnings:
            kinds[w["kind"]] = kinds.get(w["kind"], 0) + 1
        return {
            "n_cells_executed": self._exec_index,
            "n_warnings": len(self.warnings),
            "by_kind": kinds,
            "warnings": self.warnings,
        }

    # ------------------------------------------------------------------
    # IPython hooks
    # ------------------------------------------------------------------

    def _pre_run_cell(self, info) -> None:
        """Open a tracker for this cell and install it as the global one."""
        self._exec_index += 1
        src = getattr(info, "raw_cell", "") or ""
        self._cell_src_hash = hashlib.sha256(src.encode("utf-8")).hexdigest()
        self._cell_t0 = time.time()

        # Static analysis of this cell's name graph.
        loads, stores = _ast_loads_and_stores(src)
        self._cell_loads = loads
        self._cell_stores = stores
        self._cell_warnings = []
        self._cell_dep_parents = []

        # 1. Hidden-state-leak detection (Pimentel 2019, Samuel 2024).
        leaky_names = sorted(loads - stores - set(self._name_defs.keys()))
        for name in leaky_names:
            warn = {
                "kind": "hidden_state_leak",
                "cell_index": self._exec_index,
                "name": name,
                "message": (
                    f"cell {self._exec_index} reads name {name!r} that was "
                    "not defined by any earlier cell in this run; the cell "
                    "depends on prior kernel state and may not reproduce "
                    "from a fresh kernel."
                ),
            }
            self._cell_warnings.append(warn)
            self.warnings.append(warn)
            print(
                f"[scitex-notebook] WARN cell {self._exec_index}: "
                f"hidden-state name {name!r} (no defining cell in this run)",
                file=sys.stderr,
            )

        # 2. Data-dependency parent edge — most recent earlier cell that
        #    defined a name this cell loads. Falls back to ``None``
        #    (independent cell) rather than the trivial previous-executed
        #    cell, because the latter encodes nothing reproducibility-wise.
        dep_parents: list[tuple[int, str]] = []
        for name in loads:
            if name in self._name_defs:
                dep_parents.append(self._name_defs[name])
        # Sort by cell_index descending → most recent definer first.
        dep_parents.sort(key=lambda t: -t[0])
        primary_parent_sid: Optional[str] = dep_parents[0][1] if dep_parents else None
        self._cell_dep_parents = [sid for _, sid in dep_parents]

        sid = f"{self._sid_prefix}-cell-{self._exec_index:04d}"

        metadata: dict[str, Any] = {
            "scitex_notebook_magic_version": 2,
            "notebook_path": str(self.notebook_path) if self.notebook_path else None,
            "cell_index": self._exec_index,
            "cell_source_sha256": self._cell_src_hash,
            "cell_source_len": len(src),
            "loads": sorted(loads),
            "stores": sorted(stores),
            "leaky_names": leaky_names,
            "dep_parents": self._cell_dep_parents,
        }

        self._tracker = SessionTracker(
            session_id=sid,
            script_path=str(self.notebook_path) if self.notebook_path else None,
            parent_session=primary_parent_sid,
            metadata=metadata,
        )
        set_tracker(self._tracker)

    def _post_run_cell(self, result) -> None:
        """Finalize the cell's tracker and chain forward."""
        if self._tracker is None:
            return
        try:
            had_error = getattr(result, "error_in_exec", None) is not None
            status = "failed" if had_error else "success"
            exit_code = 1 if had_error else 0
            # Wall time and error flag are not yet persisted: SessionTracker
            # accepts metadata only at construction in this scitex-clew rev.
            # Attaching them as a follow-up requires a small _db.update_run
            # API; left as TODO so this magic stays a one-file change.
            self._tracker.finalize(status=status, exit_code=exit_code)

            # 3. Out-of-order execution check (Wang 2020, Pimentel 2019).
            #    IPython's execution_count is the global cell-run counter.
            #    In a clean linear run starting from a fresh kernel it
            #    increases by exactly 1 per cell and matches our local
            #    self._exec_index. Mismatches mean the user re-ran cells
            #    or skipped some — both reproducibility risks.
            ec = getattr(result, "execution_count", None)
            if ec is not None and ec != self._exec_index:
                warn = {
                    "kind": "out_of_order_execution",
                    "cell_index": self._exec_index,
                    "execution_count": ec,
                    "message": (
                        f"cell {self._exec_index} ran with execution_count={ec}; "
                        "kernel was not fresh — earlier cells were re-run or "
                        "skipped, breaking linear-from-top reproducibility."
                    ),
                }
                self._cell_warnings.append(warn)
                self.warnings.append(warn)
                print(
                    f"[scitex-notebook] WARN cell {self._exec_index}: "
                    f"out-of-order (execution_count={ec})",
                    file=sys.stderr,
                )

            # Only register name definitions if the cell ran cleanly —
            # otherwise the assignment never happened and a downstream
            # cell that reads the name should be flagged as a leak.
            if not had_error:
                for name in self._cell_stores:
                    self._name_defs[name] = (
                        self._exec_index,
                        self._tracker.session_id,
                    )
        finally:
            set_tracker(None)
            self._prev_session = self._tracker.session_id
            self._tracker = None
            self._cell_t0 = None
            self._cell_src_hash = None
            self._cell_loads = set()
            self._cell_stores = set()


# ---------------------------------------------------------------------------
# IPython extension entry point
# ---------------------------------------------------------------------------

_INSTALLED: Optional[ScitexNotebookMagics] = None


[docs] def load_ipython_extension(ipython) -> None: """``%load_ext scitex_notebook`` enters here. We attach the per-cell hook installer to ``ipython.user_ns`` under a private name so the user can introspect it (``_scitex_nb_magic``) but isn't tempted to call its internals. """ global _INSTALLED if _INSTALLED is not None: return # idempotent _INSTALLED = ScitexNotebookMagics(ipython) ipython.user_ns["_scitex_nb_magic"] = _INSTALLED print( "[scitex-notebook] cell-level Clew tracking enabled " f"(notebook: {_INSTALLED.notebook_path or 'unknown'})" )
[docs] def unload_ipython_extension(ipython) -> None: """``%unload_ext scitex_notebook``.""" global _INSTALLED if _INSTALLED is None: return try: ipython.events.unregister("pre_run_cell", _INSTALLED._pre_run_cell) ipython.events.unregister("post_run_cell", _INSTALLED._post_run_cell) except Exception: pass ipython.user_ns.pop("_scitex_nb_magic", None) _INSTALLED = None
__all__ = ["load_ipython_extension", "unload_ipython_extension"] # EOF