From ff915c2d2f8af295ffd168018ef7736cd20975c1 Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Fri, 12 Jun 2026 21:54:16 +0000 Subject: [PATCH 01/12] docs(specs): add pydantic config loader design for ENG-607 Reusable pydantic-backed config facility: load_pydantic_config (validate YAML -> typed model at build time) plus a semantic-type converter so a validated config flows into pods as a first-class, JSON-hashed input and is auto-deserialized to the typed model. Schema lives in the wrapped package's config/ subpackage; YAML stays the authoring format. Co-Authored-By: Claude Opus 4.8 (1M context) --- ...026-06-12-pydantic-config-loader-design.md | 169 ++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 superpowers/specs/2026-06-12-pydantic-config-loader-design.md diff --git a/superpowers/specs/2026-06-12-pydantic-config-loader-design.md b/superpowers/specs/2026-06-12-pydantic-config-loader-design.md new file mode 100644 index 00000000..c7d9cfc8 --- /dev/null +++ b/superpowers/specs/2026-06-12-pydantic-config-loader-design.md @@ -0,0 +1,169 @@ +# Design: Strongly-typed, schema-validated pipeline config loading (ENG-607) + +**Status:** Approved (design) +**Date:** 2026-06-12 +**Linear:** ENG-607 (project: Wrap other RawData → ETL repos in Orcapod) +**Related:** ENG-601 (config is a content-hashed broadcast pod input), PLT-964 (`OrcapodConfig` nested-config pattern) + +## Overview + +Pipelines wrapped in orcapod (starting with orcapod-spikesorting) are driven by large, +deeply-nested YAML config files — the spike-sorting config alone is ~9 top-level sections and +hundreds of parameters. Today such a config is loaded with `yaml.safe_load` into a plain `dict` +and accessed by string keys throughout the wrapper and its `enigma-ephys` pods. There is no +validation or typing: a typo'd key, a wrong type, or a missing nested field surfaces as a deep +failure mid-processing (often only on a Ray worker) or as a silently wrong result. 
+ +This design adds a **reusable, pydantic-backed config facility in orcapod-python** so a wrapped +pipeline can: (1) define its config **schema** once as pydantic models, (2) **validate** a YAML +config against that schema and load it into a **typed object** at pipeline-build time, and +(3) pass that validated object into function pods as a **first-class, content-hashed input**, +where pods receive it already deserialized and typed. + +## Goals & Success Criteria + +- Loading an invalid config fails **immediately at build time** (before any pod runs) with a + clear, field-located error (wrong type, unknown key, out-of-range value, missing required). +- Pods receive the config as a **typed pydantic model** (attribute access, IDE/type-checker + support), not an untyped dict — with **no per-pod deserialization boilerplate**. +- The config's pod-input **content hash is over its validated, canonical meaning**, so + formatting-only YAML edits (comments, key order, whitespace) do **not** bust the cache; only + meaningful value changes do. (An improvement over ENG-601's raw-file hashing.) +- The facility is **reusable** across every wrapped ETL repo, not specific to spike-sorting. + +## Authoring model + +Two artifacts, complementary (not redundant): + +- **Schema** — pydantic model classes, written once by the pipeline developer, living in a + `config/` subpackage of the wrapped repo (e.g. `orcapod_spikesorting/config/`). Defines + structure, types, constraints, and defaults. Changes rarely. +- **Values** — the YAML file a scientist edits per run (e.g. `subset_data`, `cache_path`). + YAML remains the human-authoring format; nobody hand-writes a pydantic object to configure a + run. 
+ +## Architecture & data flow + +``` +Author (YAML) Pipeline build (driver) Pod (worker) +───────────── ─────────────────────── ──────────── +config.yaml ──▶ load_pydantic_config(path, SpikeSortingConfig) def preprocess(rec, config: SpikeSortingConfig): + → yaml.safe_load config.kilosort.batch_size # typed + validated + → model.model_validate(...) ... + → SpikeSortingConfig instance + │ + ▼ + broadcast as a pod input via a dict/list source. + A registered semantic-type converter maps the + model ⇄ Arrow struct; + content hash = hash(qualname + canonical JSON). + │ identity = meaning of config, not file formatting + ▼ + orcapod hashes + transports; worker reconstructs + the typed model from the struct automatically. +``` + +## Components & API + +All new code lives in a new module `orcapod/pydantic_config.py` (loader + converter), kept +separate from `orcapod/config.py` (which is orcapod's *own* `OrcapodConfig` runtime settings — +a different concept). `pydantic` becomes a dependency of orcapod-python (pydantic v2). + +### `load_pydantic_config` + +```python +M = TypeVar("M", bound=pydantic.BaseModel) + +def load_pydantic_config(path: str | Path, model_cls: type[M]) -> M: + """Read a YAML file, validate it against `model_cls`, return the typed model. + + Raises a clear, file-located error on invalid YAML or schema violation. + """ +``` + +- Named `load_pydantic_config` (not `load_config`, which is already a top-level export for + `OrcapodConfig`) to avoid collision and to be explicit that it is pydantic-backed. +- Reads YAML via `yaml.safe_load`, validates via `model_cls.model_validate(data)`, returns the + instance. On `pydantic.ValidationError` (or YAML parse error), re-raise wrapped with the file + path for context. 
+ +### `OrcapodBaseConfig` (optional base) + +```python +class OrcapodBaseConfig(pydantic.BaseModel): + """Recommended base for pipeline config schemas; strict by default.""" + model_config = ConfigDict(extra="forbid", frozen=True) +``` + +- `extra="forbid"` makes typo'd/unknown keys an error. `frozen=True` makes instances immutable + (safer as a broadcast input). Use is recommended, not required — a schema author may subclass + `pydantic.BaseModel` directly if they need different semantics. + +### `PydanticModelConverter` (semantic-type converter) + +Modeled directly on `PythonPathStructConverter` (which maps `Path` ⇄ `struct` +with file-content hashing). Registered in the `DataContext` semantic registry. + +- `can_handle_python_type(t)`: `issubclass(t, pydantic.BaseModel)`. +- **python → arrow:** `struct<__pydantic_model__: large_string, __pydantic_json__: large_string>` + where `__pydantic_model__` is the model's fully-qualified `module:qualname` and + `__pydantic_json__` is canonical JSON (`model.model_dump_json()`; declared model fields serialize in a deterministic order, but free-form `dict`-typed fields keep their insertion order, so reordering keys inside such a field changes the JSON). +- **arrow → python:** import the class from the stored qualname and + `model_cls.model_validate_json(json)`. Self-describing — no external type context needed. +- **content hash:** hash over `(__pydantic_model__, canonical JSON)`, so identity tracks config + meaning + schema identity, independent of source-YAML formatting. + +### Pipeline wiring (in the wrapped repo) + +The broadcast config source is built from the **validated model instance** (via a dict/list +source, whose values route through the type converter) rather than a `Path` `DataFrameSource`. +Pods declare a parameter typed as the model (`config: SpikeSortingConfig`) and orcapod injects +the reconstructed, validated model — handling transport, hashing, and deserialization. + +## Error handling + +- **Invalid YAML / schema violation:** raised at build time, before any pod runs, with the file + path and pydantic's field-level detail. 
+- **Unimportable model class on reconstruction (worker):** clear `ImportError` naming the stored + qualname (e.g. the wrapped package isn't on the worker path). + +## Testing + +- **Loader:** valid config → model; wrong-type, unknown-key (with `extra="forbid"`), and + missing-required configs → raise with clear messages including the file path. +- **Converter round-trip:** `model → arrow struct → model` equality. +- **Hash stability:** formatting-only YAML edits → identical content hash (cache still hits); + any value change → different hash. +- **End-to-end:** a small pipeline where a pod consumes a typed config; confirm the pod receives + the model and that a formatting-only config edit yields a cache hit. + +## Scope & boundaries + +**In scope (orcapod-python, this work):** +- `orcapod/pydantic_config.py`: `load_pydantic_config`, `OrcapodBaseConfig`, + `PydanticModelConverter`. +- Register the converter in the `DataContext` semantic registry. +- Add `pydantic` (v2) as a dependency; tests; docs. + +**Out of scope (orcapod-spikesorting follow-up):** +- Define a `config/` subpackage of pydantic models for the spike-sorting config. +- Swap the broadcast config source from a `Path` `DataFrameSource` to the validated model + source; annotate pods with the model type. +- Migrate `enigma-ephys` dict-key call sites. Eased by handing existing functions + `config.model_dump()` (a plain dict) during transition, so the migration can be incremental. + +## Dependencies & risks + +- Adds `pydantic` v2 as an orcapod-python dependency (intended). +- Content-hash semantics change for configs (meaning-based, not file-bytes). This is desired but + means existing caches keyed on the old `Path`-file hash won't match — a one-time recompute when + a pipeline migrates to the typed config. Document this. +- Reconstruction requires the model class to be importable on workers (already true for wrapped + packages shipped via Ray `py_modules`). 
+ +## Deferred / not now (YAGNI) + +- JSON-Schema export for docs/tooling — available for free via `model.model_json_schema()` if/when + wanted; not built now. +- A standalone external schema file (XSD/JSON Schema) as the source of truth — rejected in favor + of pydantic models as the single source of truth (avoids a duplicated, drift-prone schema). From 2e7060f86409ac82d1ce8bca29153b2657720d88 Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Fri, 12 Jun 2026 22:09:32 +0000 Subject: [PATCH 02/12] docs(plans): add pydantic config loader implementation plan (ENG-607) Task-by-task TDD plan: pydantic dependency, load_pydantic_config + OrcapodBaseConfig, PydanticModelConverter semantic type, hash-stability tests, and registration in the production (v0.1.json) and standalone registries. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../2026-06-12-pydantic-config-loader.md | 584 ++++++++++++++++++ 1 file changed, 584 insertions(+) create mode 100644 superpowers/plans/2026-06-12-pydantic-config-loader.md diff --git a/superpowers/plans/2026-06-12-pydantic-config-loader.md b/superpowers/plans/2026-06-12-pydantic-config-loader.md new file mode 100644 index 00000000..fb5665cb --- /dev/null +++ b/superpowers/plans/2026-06-12-pydantic-config-loader.md @@ -0,0 +1,584 @@ +# Pydantic Config Loader Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a reusable, pydantic-backed config facility to orcapod-python: validate a YAML config against a pydantic schema into a typed model at build time, and make a validated model a first-class, content-hashed orcapod value that pods receive already deserialized. 
+ +**Architecture:** A new module `orcapod/pydantic_config.py` provides `load_pydantic_config()` (YAML → validated model), an optional strict base `OrcapodBaseConfig`, and a `PydanticModelConverter` semantic-type converter modeled on `PythonPathStructConverter`. The converter maps any `pydantic.BaseModel` ⇄ an Arrow struct holding the model's fully-qualified class name plus canonical JSON, content-hashing the canonical JSON so identity tracks config *meaning*, not YAML formatting. The converter is registered in the production semantic registry (`contexts/data/v0.1.json`) so the existing `UniversalTypeConverter` and `StarfixArrowHasher` pick it up automatically. + +**Tech Stack:** Python 3.12, pydantic v2, PyArrow, PyYAML, pytest, uv. + +**Spec:** `superpowers/specs/2026-06-12-pydantic-config-loader-design.md` (ENG-607). + +**Conventions:** Run everything via `uv run`. Google-style docstrings, no ReST roles. Conventional Commits. End commit messages with the `Co-Authored-By: Claude Opus 4.8 (1M context) ` trailer. + +--- + +## File Structure + +- Create: `src/orcapod/pydantic_config.py` — `load_pydantic_config`, `OrcapodBaseConfig`, `PydanticModelConverter`. +- Create: `tests/test_pydantic_config.py` — loader + converter unit/integration tests. +- Modify: `pyproject.toml` — add `pydantic>=2` to `dependencies`. +- Modify: `src/orcapod/contexts/data/v0.1.json` — register the `pydantic` converter in `semantic_registry.converters` (production path). +- Modify: `src/orcapod/hashing/versioned_hashers.py:135-138` — register the converter in the standalone fallback registry for consistency. + +--- + +### Task 1: Add the pydantic dependency + +**Files:** +- Modify: `pyproject.toml` (the `dependencies` list, ~line 9-28) + +- [ ] **Step 1: Add the dependency** + +In `pyproject.toml`, add to the `dependencies` array (e.g. 
after the `"deltalake>=1.0.2",` line): + +```toml + "pydantic>=2", +``` + +- [ ] **Step 2: Sync the environment** + +Run: `uv sync` +Expected: resolves and installs pydantic 2.x with no conflict. + +- [ ] **Step 3: Verify import** + +Run: `uv run python -c "import pydantic; print(pydantic.VERSION)"` +Expected: prints a `2.x` version string. + +- [ ] **Step 4: Commit** + +```bash +git add pyproject.toml uv.lock +git commit -m "chore(deps): add pydantic for typed config loading (ENG-607)" +``` + +--- + +### Task 2: `load_pydantic_config` + `OrcapodBaseConfig` + +**Files:** +- Create: `src/orcapod/pydantic_config.py` +- Test: `tests/test_pydantic_config.py` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/test_pydantic_config.py`: + +```python +"""Tests for orcapod.pydantic_config (ENG-607).""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from orcapod.pydantic_config import OrcapodBaseConfig, load_pydantic_config + + +class SampleConfig(OrcapodBaseConfig): + name: str + threshold: float + retries: int = 3 + + +def _write(tmp_path: Path, text: str) -> Path: + p = tmp_path / "config.yaml" + p.write_text(text, encoding="utf-8") + return p + + +def test_loads_valid_config(tmp_path): + path = _write(tmp_path, "name: run1\nthreshold: 6.0\n") + cfg = load_pydantic_config(path, SampleConfig) + assert isinstance(cfg, SampleConfig) + assert cfg.name == "run1" + assert cfg.threshold == 6.0 + assert cfg.retries == 3 # default applied + + +def test_wrong_type_raises_with_path(tmp_path): + path = _write(tmp_path, "name: run1\nthreshold: not-a-number\n") + with pytest.raises(ValueError) as exc: + load_pydantic_config(path, SampleConfig) + assert "threshold" in str(exc.value) + assert str(path) in str(exc.value) + + +def test_unknown_key_raises(tmp_path): + path = _write(tmp_path, "name: run1\nthreshold: 6.0\ntypo_key: 1\n") + with pytest.raises(ValueError) as exc: + load_pydantic_config(path, SampleConfig) + assert "typo_key" in 
str(exc.value) + + +def test_missing_required_raises(tmp_path): + path = _write(tmp_path, "threshold: 6.0\n") + with pytest.raises(ValueError) as exc: + load_pydantic_config(path, SampleConfig) + assert "name" in str(exc.value) +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/test_pydantic_config.py -q` +Expected: FAIL — `ModuleNotFoundError: No module named 'orcapod.pydantic_config'`. + +- [ ] **Step 3: Implement the loader + base** + +Create `src/orcapod/pydantic_config.py`: + +```python +"""Pydantic-backed config loading for orcapod pipelines (ENG-601 / ENG-607). + +Provides `load_pydantic_config` (validate a YAML file against a pydantic model) +and `OrcapodBaseConfig` (a strict base for config schemas). A companion +`PydanticModelConverter` (also in this module) makes a validated model a +first-class, content-hashed orcapod value. +""" + +from __future__ import annotations + +from pathlib import Path +from typing import TypeVar + +import pydantic +import yaml + +M = TypeVar("M", bound=pydantic.BaseModel) + + +class OrcapodBaseConfig(pydantic.BaseModel): + """Recommended base for pipeline config schemas. + + Defaults to strict validation: unknown keys are rejected and instances are + immutable. Subclass this for pipeline configs; subclass `pydantic.BaseModel` + directly only when different semantics are required. + """ + + model_config = pydantic.ConfigDict(extra="forbid", frozen=True) + + +def load_pydantic_config(path: str | Path, model_cls: type[M]) -> M: + """Read a YAML file and validate it against a pydantic model. + + Args: + path: Path to the YAML config file. + model_cls: The pydantic model class to validate against. + + Returns: + A validated instance of `model_cls`. + + Raises: + ValueError: If the YAML cannot be parsed or fails validation. The error + message includes the file path and the underlying field-level detail. 
+ """ + path = Path(path) + try: + with open(path, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) + except yaml.YAMLError as e: + raise ValueError(f"Could not parse YAML config {path}: {e}") from e + + try: + return model_cls.model_validate(data) + except pydantic.ValidationError as e: + raise ValueError(f"Config validation failed for {path}:\n{e}") from e +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `uv run pytest tests/test_pydantic_config.py -q` +Expected: PASS (4 passed). + +- [ ] **Step 5: Commit** + +```bash +git add src/orcapod/pydantic_config.py tests/test_pydantic_config.py +git commit -m "feat(pydantic_config): add load_pydantic_config and OrcapodBaseConfig (ENG-607)" +``` + +--- + +### Task 3: `PydanticModelConverter` — model ⇄ Arrow struct round-trip + +**Files:** +- Modify: `src/orcapod/pydantic_config.py` +- Test: `tests/test_pydantic_config.py` + +- [ ] **Step 1: Write the failing tests** + +Append to `tests/test_pydantic_config.py`: + +```python +import pyarrow as pa + +from orcapod.pydantic_config import PydanticModelConverter + + +def _converter() -> PydanticModelConverter: + return PydanticModelConverter() + + +def test_converter_python_type_and_struct_signature(): + conv = _converter() + assert conv.python_type is pydantic.BaseModel + sig = conv.arrow_struct_type + assert pa.types.is_struct(sig) + assert {f.name for f in sig} == {"__pydantic_model__", "__pydantic_json__"} + assert all(f.type == pa.large_string() for f in sig) + + +def test_converter_can_handle_model_subclass(): + conv = _converter() + assert conv.can_handle_python_type(SampleConfig) is True + assert conv.can_handle_python_type(int) is False + + +def test_converter_roundtrip_model_to_struct_to_model(): + conv = _converter() + cfg = SampleConfig(name="run1", threshold=6.0, retries=5) + struct = conv.python_to_struct_dict(cfg) + assert set(struct.keys()) == {"__pydantic_model__", "__pydantic_json__"} + assert 
struct["__pydantic_model__"].endswith(":SampleConfig") + restored = conv.struct_dict_to_python(struct) + assert isinstance(restored, SampleConfig) + assert restored == cfg + + +def test_converter_can_handle_struct_type_and_is_semantic_struct(): + conv = _converter() + assert conv.can_handle_struct_type(conv.arrow_struct_type) is True + assert conv.can_handle_struct_type(pa.struct([pa.field("path", pa.large_string())])) is False + cfg = SampleConfig(name="x", threshold=1.0) + assert conv.is_semantic_struct(conv.python_to_struct_dict(cfg)) is True + assert conv.is_semantic_struct({"path": "/tmp/x"}) is False +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/test_pydantic_config.py -q` +Expected: FAIL — `ImportError: cannot import name 'PydanticModelConverter'`. + +- [ ] **Step 3: Implement the converter** + +Append to `src/orcapod/pydantic_config.py`: + +```python +import importlib +from typing import Any + +from orcapod.semantic_types.semantic_struct_converters import ( + SemanticStructConverterBase, +) + +# Arrow struct field names for the serialized config. +_MODEL_FIELD = "__pydantic_model__" # fully-qualified "module:QualName" +_JSON_FIELD = "__pydantic_json__" # canonical JSON of the model + + +def _qualified_name(cls: type) -> str: + return f"{cls.__module__}:{cls.__qualname__}" + + +def _import_model(qualified_name: str) -> type[pydantic.BaseModel]: + module_path, _, qualname = qualified_name.partition(":") + module = importlib.import_module(module_path) + obj: Any = module + for part in qualname.split("."): + obj = getattr(obj, part) + return obj + + +class PydanticModelConverter(SemanticStructConverterBase): + """Semantic-type converter for pydantic models. + + Maps any `pydantic.BaseModel` instance to an Arrow struct holding the + model's fully-qualified class name and its canonical JSON, and back. 
Content + is hashed over (class name + canonical JSON), so identity tracks the config's + meaning rather than source-file formatting. Modeled on `PythonPathStructConverter`. + """ + + def __init__(self) -> None: + super().__init__("pydantic") + import pyarrow as pa + + self._arrow_struct_type = pa.struct( + [ + pa.field(_MODEL_FIELD, pa.large_string()), + pa.field(_JSON_FIELD, pa.large_string()), + ] + ) + + @property + def python_type(self) -> type: + return pydantic.BaseModel + + @property + def arrow_struct_type(self) -> "Any": + return self._arrow_struct_type + + def can_handle_python_type(self, python_type: type) -> bool: + return isinstance(python_type, type) and issubclass( + python_type, pydantic.BaseModel + ) + + def can_handle_struct_type(self, struct_type: "Any") -> bool: + import pyarrow as pa + + if not pa.types.is_struct(struct_type): + return False + for field in self._arrow_struct_type: + if ( + field.name not in struct_type.names + or struct_type[field.name].type != field.type + ): + return False + return True + + def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: + return set(struct_dict.keys()) == {_MODEL_FIELD, _JSON_FIELD} + + def python_to_struct_dict(self, value: Any) -> dict[str, Any]: + if not isinstance(value, pydantic.BaseModel): + raise TypeError(f"Expected a pydantic BaseModel, got {type(value)}") + return { + _MODEL_FIELD: _qualified_name(type(value)), + _JSON_FIELD: value.model_dump_json(), + } + + def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Any: + qualified_name = struct_dict.get(_MODEL_FIELD) + json_str = struct_dict.get(_JSON_FIELD) + if qualified_name is None or json_str is None: + raise ValueError( + f"Missing '{_MODEL_FIELD}'/'{_JSON_FIELD}' in struct dict" + ) + model_cls = _import_model(qualified_name) + return model_cls.model_validate_json(json_str) + + def hash_struct_dict( + self, struct_dict: dict[str, Any], add_prefix: bool = False + ) -> str: + qualified_name = 
struct_dict.get(_MODEL_FIELD) + json_str = struct_dict.get(_JSON_FIELD) + if qualified_name is None or json_str is None: + raise ValueError( + f"Missing '{_MODEL_FIELD}'/'{_JSON_FIELD}' in struct dict" + ) + content = f"{qualified_name}\n{json_str}".encode("utf-8") + content_hash = self._compute_content_hash(content) + return self._format_hash_string(content_hash.digest, add_prefix=add_prefix) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `uv run pytest tests/test_pydantic_config.py -q` +Expected: PASS (all tests, including Task 2's). + +- [ ] **Step 5: Commit** + +```bash +git add src/orcapod/pydantic_config.py tests/test_pydantic_config.py +git commit -m "feat(pydantic_config): add PydanticModelConverter semantic type (ENG-607)" +``` + +--- + +### Task 4: Hash stability — meaning, not formatting + +**Files:** +- Test: `tests/test_pydantic_config.py` + +- [ ] **Step 1: Write the tests (expected to pass against Task 3's implementation)** + +Append to `tests/test_pydantic_config.py`: + +```python +def test_hash_equal_for_equal_values(): + conv = _converter() + a = conv.python_to_struct_dict(SampleConfig(name="run1", threshold=6.0, retries=5)) + b = conv.python_to_struct_dict(SampleConfig(name="run1", threshold=6.0, retries=5)) + assert conv.hash_struct_dict(a) == conv.hash_struct_dict(b) + + +def test_hash_differs_for_different_values(): + conv = _converter() + a = conv.python_to_struct_dict(SampleConfig(name="run1", threshold=6.0)) + b = conv.python_to_struct_dict(SampleConfig(name="run1", threshold=7.0)) + assert conv.hash_struct_dict(a) != conv.hash_struct_dict(b) + + +def test_hash_stable_across_yaml_formatting(tmp_path): + # Two YAMLs that differ only in comments / key order / whitespace + # must produce the same validated model and therefore the same hash. 
+ yaml_a = "name: run1\nthreshold: 6.0\nretries: 5\n" + yaml_b = "# a comment\nretries: 5\nthreshold: 6.0\nname: run1\n" + pa_path = _write(tmp_path, yaml_a) + cfg_a = load_pydantic_config(pa_path, SampleConfig) + pb_path = tmp_path / "b.yaml" + pb_path.write_text(yaml_b, encoding="utf-8") + cfg_b = load_pydantic_config(pb_path, SampleConfig) + + conv = _converter() + ha = conv.hash_struct_dict(conv.python_to_struct_dict(cfg_a)) + hb = conv.hash_struct_dict(conv.python_to_struct_dict(cfg_b)) + assert ha == hb +``` + +- [ ] **Step 2: Run tests** + +Run: `uv run pytest tests/test_pydantic_config.py -q` +Expected: PASS — the implementation from Task 3 already satisfies these (no new code needed). If `test_hash_stable_across_yaml_formatting` fails, it indicates `model_dump_json()` is non-deterministic for this model; investigate before proceeding. + +- [ ] **Step 3: Commit** + +```bash +git add tests/test_pydantic_config.py +git commit -m "test(pydantic_config): assert hash tracks config meaning, not formatting (ENG-607)" +``` + +--- + +### Task 5: Register the converter in the production + standalone registries + +**Files:** +- Modify: `src/orcapod/contexts/data/v0.1.json` (the `semantic_registry` → `_config` → `converters` object) +- Modify: `src/orcapod/hashing/versioned_hashers.py:135-138` +- Test: `tests/test_pydantic_config.py` + +- [ ] **Step 1: Write the failing integration test** + +Append to `tests/test_pydantic_config.py`: + +```python +from orcapod.contexts import get_default_context +from orcapod.types import Schema + + +def test_registered_in_default_context_roundtrip(): + ctx = get_default_context() + converter = ctx.type_converter + + cfg = SampleConfig(name="run1", threshold=6.0, retries=5) + table = converter.python_dicts_to_arrow_table( + [{"config": cfg}], python_schema=Schema({"config": SampleConfig}) + ) + # Stored as the pydantic struct, not an opaque blob. 
+ assert pa.types.is_struct(table.schema.field("config").type) + assert {f.name for f in table.schema.field("config").type} == { + "__pydantic_model__", + "__pydantic_json__", + } + + restored = converter.arrow_table_to_python_dicts(table) + assert isinstance(restored[0]["config"], SampleConfig) + assert restored[0]["config"] == cfg + + +def test_default_context_hashes_model_stably(): + ctx = get_default_context() + converter = ctx.type_converter + schema = Schema({"config": SampleConfig}) + t1 = converter.python_dicts_to_arrow_table( + [{"config": SampleConfig(name="r", threshold=6.0)}], python_schema=schema + ) + t2 = converter.python_dicts_to_arrow_table( + [{"config": SampleConfig(name="r", threshold=6.0)}], python_schema=schema + ) + h1 = ctx.arrow_hasher.hash_table(t1) + h2 = ctx.arrow_hasher.hash_table(t2) + assert h1 == h2 +``` + +Note: if `arrow_hasher` exposes a different method than `hash_table`, adjust the last two lines to the actual public hashing entry point (confirm by reading `ctx.arrow_hasher`'s class). The first test is the load-bearing one. + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/test_pydantic_config.py -k "default_context" -q` +Expected: FAIL — the converter is not yet registered, so `python_dicts_to_arrow_table` does not produce the pydantic struct (it errors or produces a non-struct column). + +- [ ] **Step 3: Register in the production JSON registry** + +In `src/orcapod/contexts/data/v0.1.json`, inside `semantic_registry._config.converters` (alongside `"path"` and `"upath"`), add: + +```json + "pydantic": { + "_class": "orcapod.pydantic_config.PydanticModelConverter", + "_config": {} + } +``` + +(Place it as a sibling key; mind the trailing commas so the JSON stays valid.) 
+ +- [ ] **Step 4: Register in the standalone fallback registry** + +In `src/orcapod/hashing/versioned_hashers.py`, after the existing `registry.register_converter("path", path_converter)` (line ~138), add: + +```python + from orcapod.pydantic_config import PydanticModelConverter + + registry.register_converter("pydantic", PydanticModelConverter()) +``` + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `uv run pytest tests/test_pydantic_config.py -q` +Expected: PASS (all tests). + +- [ ] **Step 6: Run the semantic-types + hashing suites for regressions** + +Run: `uv run pytest tests/test_semantic_types tests/test_hashing -q` +Expected: PASS (no regressions from the new registration). + +- [ ] **Step 7: Commit** + +```bash +git add src/orcapod/contexts/data/v0.1.json src/orcapod/hashing/versioned_hashers.py tests/test_pydantic_config.py +git commit -m "feat(pydantic_config): register PydanticModelConverter in default registries (ENG-607)" +``` + +--- + +### Task 6: Full-suite verification + DESIGN_ISSUES note + +**Files:** +- Modify: `DESIGN_ISSUES.md` (optional — only if a matching issue exists; otherwise skip) + +- [ ] **Step 1: Run the full test suite** + +Run: `uv run pytest -m "not postgres" -q` +Expected: PASS (no regressions). Skipped tests are expected and count as normal. + +- [ ] **Step 2: Type-check the new module (if the repo runs a type checker in CI)** + +Run: `uv run python -c "import orcapod.pydantic_config"` +Expected: imports cleanly. (If the repo uses pyright/mypy in CI, run that on `src/orcapod/pydantic_config.py` and fix any issues.) + +- [ ] **Step 3: Final commit (only if Step 2 required edits)** + +```bash +git add -A +git commit -m "chore(pydantic_config): satisfy type checker (ENG-607)" +``` + +--- + +## Self-Review + +**Spec coverage:** +- Reusable loader in orcapod-python → Task 2 (`load_pydantic_config`). ✓ +- Validate at build time, clear field-located error → Task 2 tests (wrong type / unknown key / missing required, path in message). 
✓ +- Typed config is a first-class, content-hashed pod input → Task 3 (converter) + Task 5 (registration; round-trip + struct storage). ✓ +- Pods receive the typed model with no per-pod deserialization → Task 5 `test_registered_in_default_context_roundtrip` proves automatic reconstruction via the type converter (the actual pod parameter wiring is the spike-sorting follow-up, out of scope here). ✓ +- Hash over meaning, not formatting → Task 4 (`test_hash_stable_across_yaml_formatting`). ✓ +- `OrcapodBaseConfig` strict base → Task 2. ✓ +- Add pydantic dependency → Task 1. ✓ + +**Out of scope (correctly deferred):** spike-sorting `config/` models, source swap, pod annotations, enigma-ephys migration (separate follow-up per spec). + +**Type consistency:** `PydanticModelConverter` uses `_MODEL_FIELD`/`_JSON_FIELD` consistently across `python_to_struct_dict`, `struct_dict_to_python`, `is_semantic_struct`, and `hash_struct_dict`. `python_type` returns `pydantic.BaseModel`; registry subclass-matching handles concrete subclasses (verified against `SemanticTypeRegistry.get_converter_for_python_type`). + +**Known verification point:** Task 5 Step 1 notes the `arrow_hasher` hashing method name (`hash_table`) must be confirmed against the concrete hasher class; the primary round-trip assertion does not depend on it. 
From 8bbc5c10d19ef33c704a142dbdc6c403d2224d24 Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Fri, 12 Jun 2026 22:16:25 +0000 Subject: [PATCH 03/12] chore(deps): add pydantic for typed config loading (ENG-607) Co-Authored-By: Claude Opus 4.8 (1M context) --- pyproject.toml | 1 + uv.lock | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b3c772ef..b4a6e2a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ "polars>=1.31.0", "beartype>=0.21.0", "deltalake>=1.0.2", + "pydantic>=2", "graphviz>=0.21", "gitpython>=3.1.45", "universal-pathlib>=0.3.8", diff --git a/uv.lock b/uv.lock index 446f7ebf..f9e9d0d8 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.11.0" resolution-markers = [ "python_full_version >= '3.14' and sys_platform == 'darwin'", @@ -2301,6 +2301,7 @@ dependencies = [ { name = "pandas" }, { name = "polars" }, { name = "pyarrow" }, + { name = "pydantic" }, { name = "pygraphviz" }, { name = "pymongo" }, { name = "pyyaml" }, @@ -2380,6 +2381,7 @@ requires-dist = [ { name = "psycopg", extras = ["binary"], marker = "extra == 'all'", specifier = ">=3.0" }, { name = "psycopg", extras = ["binary"], marker = "extra == 'postgresql'", specifier = ">=3.0" }, { name = "pyarrow", specifier = ">=20.0.0" }, + { name = "pydantic", specifier = ">=2" }, { name = "pygraphviz", specifier = ">=1.14" }, { name = "pymongo", specifier = ">=4.15.5" }, { name = "pyspiral", marker = "extra == 'all'", specifier = ">=0.11.0" }, From 1afbc52c065b6f500c9faeeaf574ada6827daf36 Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Fri, 12 Jun 2026 22:22:24 +0000 Subject: [PATCH 04/12] feat(pydantic_config): add load_pydantic_config and OrcapodBaseConfig (ENG-607) Co-Authored-By: Claude Opus 4.8 (1M context) --- src/orcapod/pydantic_config.py | 55 ++++++++++++++++++++++++++++++++++ tests/test_pydantic_config.py | 52 
++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 src/orcapod/pydantic_config.py create mode 100644 tests/test_pydantic_config.py diff --git a/src/orcapod/pydantic_config.py b/src/orcapod/pydantic_config.py new file mode 100644 index 00000000..7707c7dd --- /dev/null +++ b/src/orcapod/pydantic_config.py @@ -0,0 +1,55 @@ +"""Pydantic-backed config loading for orcapod pipelines (ENG-601 / ENG-607). + +Provides `load_pydantic_config` (validate a YAML file against a pydantic model) +and `OrcapodBaseConfig` (a strict base for config schemas). A companion +`PydanticModelConverter` (also in this module) makes a validated model a +first-class, content-hashed orcapod value. +""" + +from __future__ import annotations + +from pathlib import Path +from typing import TypeVar + +import pydantic +import yaml + +M = TypeVar("M", bound=pydantic.BaseModel) + + +class OrcapodBaseConfig(pydantic.BaseModel): + """Recommended base for pipeline config schemas. + + Defaults to strict validation: unknown keys are rejected and instances are + immutable. Subclass this for pipeline configs; subclass `pydantic.BaseModel` + directly only when different semantics are required. + """ + + model_config = pydantic.ConfigDict(extra="forbid", frozen=True) + + +def load_pydantic_config(path: str | Path, model_cls: type[M]) -> M: + """Read a YAML file and validate it against a pydantic model. + + Args: + path: Path to the YAML config file. + model_cls: The pydantic model class to validate against. + + Returns: + A validated instance of `model_cls`. + + Raises: + ValueError: If the YAML cannot be parsed or fails validation. The error + message includes the file path and the underlying field-level detail. 
+ """ + path = Path(path) + try: + with open(path, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) + except yaml.YAMLError as e: + raise ValueError(f"Could not parse YAML config {path}: {e}") from e + + try: + return model_cls.model_validate(data) + except pydantic.ValidationError as e: + raise ValueError(f"Config validation failed for {path}:\n{e}") from e diff --git a/tests/test_pydantic_config.py b/tests/test_pydantic_config.py new file mode 100644 index 00000000..894b9b9b --- /dev/null +++ b/tests/test_pydantic_config.py @@ -0,0 +1,52 @@ +"""Tests for orcapod.pydantic_config (ENG-607).""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from orcapod.pydantic_config import OrcapodBaseConfig, load_pydantic_config + + +class SampleConfig(OrcapodBaseConfig): + name: str + threshold: float + retries: int = 3 + + +def _write(tmp_path: Path, text: str) -> Path: + p = tmp_path / "config.yaml" + p.write_text(text, encoding="utf-8") + return p + + +def test_loads_valid_config(tmp_path): + path = _write(tmp_path, "name: run1\nthreshold: 6.0\n") + cfg = load_pydantic_config(path, SampleConfig) + assert isinstance(cfg, SampleConfig) + assert cfg.name == "run1" + assert cfg.threshold == 6.0 + assert cfg.retries == 3 # default applied + + +def test_wrong_type_raises_with_path(tmp_path): + path = _write(tmp_path, "name: run1\nthreshold: not-a-number\n") + with pytest.raises(ValueError) as exc: + load_pydantic_config(path, SampleConfig) + assert "threshold" in str(exc.value) + assert str(path) in str(exc.value) + + +def test_unknown_key_raises(tmp_path): + path = _write(tmp_path, "name: run1\nthreshold: 6.0\ntypo_key: 1\n") + with pytest.raises(ValueError) as exc: + load_pydantic_config(path, SampleConfig) + assert "typo_key" in str(exc.value) + + +def test_missing_required_raises(tmp_path): + path = _write(tmp_path, "threshold: 6.0\n") + with pytest.raises(ValueError) as exc: + load_pydantic_config(path, SampleConfig) + assert 
"name" in str(exc.value) From 26fc1fa5458bc04c8b83b374fe699e7489d7fe8f Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Fri, 12 Jun 2026 22:28:23 +0000 Subject: [PATCH 05/12] fix(pydantic_config): wrap file IO errors as ValueError (ENG-607) Co-Authored-By: Claude Opus 4.8 (1M context) --- src/orcapod/pydantic_config.py | 2 ++ tests/test_pydantic_config.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/src/orcapod/pydantic_config.py b/src/orcapod/pydantic_config.py index 7707c7dd..88c8a0d0 100644 --- a/src/orcapod/pydantic_config.py +++ b/src/orcapod/pydantic_config.py @@ -48,6 +48,8 @@ def load_pydantic_config(path: str | Path, model_cls: type[M]) -> M: data = yaml.safe_load(f) except yaml.YAMLError as e: raise ValueError(f"Could not parse YAML config {path}: {e}") from e + except OSError as e: + raise ValueError(f"Could not read YAML config {path}: {e}") from e try: return model_cls.model_validate(data) diff --git a/tests/test_pydantic_config.py b/tests/test_pydantic_config.py index 894b9b9b..e2fc667a 100644 --- a/tests/test_pydantic_config.py +++ b/tests/test_pydantic_config.py @@ -50,3 +50,17 @@ def test_missing_required_raises(tmp_path): with pytest.raises(ValueError) as exc: load_pydantic_config(path, SampleConfig) assert "name" in str(exc.value) + + +def test_missing_file_raises_value_error(tmp_path): + missing = tmp_path / "does_not_exist.yaml" + with pytest.raises(ValueError) as exc: + load_pydantic_config(missing, SampleConfig) + assert str(missing) in str(exc.value) + + +def test_empty_file_raises_value_error(tmp_path): + path = _write(tmp_path, "") + with pytest.raises(ValueError) as exc: + load_pydantic_config(path, SampleConfig) + assert str(path) in str(exc.value) From b402058cb2b68301396438b0ad1d836122c5d834 Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Fri, 12 Jun 2026 22:33:59 +0000 Subject: [PATCH 06/12] feat(pydantic_config): add PydanticModelConverter semantic type (ENG-607) Co-Authored-By: Claude Opus 4.8 (1M context) 
--- src/orcapod/pydantic_config.py | 107 ++++++++++++++++++++++++++++++++- tests/test_pydantic_config.py | 43 ++++++++++++- 2 files changed, 148 insertions(+), 2 deletions(-) diff --git a/src/orcapod/pydantic_config.py b/src/orcapod/pydantic_config.py index 88c8a0d0..a5d2478c 100644 --- a/src/orcapod/pydantic_config.py +++ b/src/orcapod/pydantic_config.py @@ -8,12 +8,18 @@ from __future__ import annotations +import importlib from pathlib import Path -from typing import TypeVar +from typing import TYPE_CHECKING, Any, TypeVar import pydantic import yaml +from orcapod.semantic_types.semantic_struct_converters import SemanticStructConverterBase + +if TYPE_CHECKING: + import pyarrow as pa + M = TypeVar("M", bound=pydantic.BaseModel) @@ -55,3 +61,102 @@ def load_pydantic_config(path: str | Path, model_cls: type[M]) -> M: return model_cls.model_validate(data) except pydantic.ValidationError as e: raise ValueError(f"Config validation failed for {path}:\n{e}") from e + + +# Arrow struct field names for the serialized config. +_MODEL_FIELD = "__pydantic_model__" # fully-qualified "module:QualName" +_JSON_FIELD = "__pydantic_json__" # canonical JSON of the model + + +def _qualified_name(cls: type) -> str: + return f"{cls.__module__}:{cls.__qualname__}" + + +def _import_model(qualified_name: str) -> type[pydantic.BaseModel]: + module_path, _, qualname = qualified_name.partition(":") + module = importlib.import_module(module_path) + obj: Any = module + for part in qualname.split("."): + obj = getattr(obj, part) + return obj + + +class PydanticModelConverter(SemanticStructConverterBase): + """Semantic-type converter for pydantic models. + + Maps any `pydantic.BaseModel` instance to an Arrow struct holding the + model's fully-qualified class name and its canonical JSON, and back. Content + is hashed over (class name + canonical JSON), so identity tracks the config's + meaning rather than source-file formatting. Modeled on `PythonPathStructConverter`. 
+ """ + + def __init__(self) -> None: + super().__init__("pydantic") + import pyarrow as pa + + self._arrow_struct_type = pa.struct( + [ + pa.field(_MODEL_FIELD, pa.large_string()), + pa.field(_JSON_FIELD, pa.large_string()), + ] + ) + + @property + def python_type(self) -> type: + return pydantic.BaseModel + + @property + def arrow_struct_type(self) -> Any: + return self._arrow_struct_type + + def can_handle_python_type(self, python_type: type) -> bool: + return isinstance(python_type, type) and issubclass( + python_type, pydantic.BaseModel + ) + + def can_handle_struct_type(self, struct_type: Any) -> bool: + import pyarrow as pa + + if not pa.types.is_struct(struct_type): + return False + for field in self._arrow_struct_type: + if ( + field.name not in struct_type.names + or struct_type[field.name].type != field.type + ): + return False + return True + + def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: + return set(struct_dict.keys()) == {_MODEL_FIELD, _JSON_FIELD} + + def python_to_struct_dict(self, value: Any) -> dict[str, Any]: + if not isinstance(value, pydantic.BaseModel): + raise TypeError(f"Expected a pydantic BaseModel, got {type(value)}") + return { + _MODEL_FIELD: _qualified_name(type(value)), + _JSON_FIELD: value.model_dump_json(), + } + + def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Any: + qualified_name = struct_dict.get(_MODEL_FIELD) + json_str = struct_dict.get(_JSON_FIELD) + if qualified_name is None or json_str is None: + raise ValueError( + f"Missing '{_MODEL_FIELD}'/'{_JSON_FIELD}' in struct dict" + ) + model_cls = _import_model(qualified_name) + return model_cls.model_validate_json(json_str) + + def hash_struct_dict( + self, struct_dict: dict[str, Any], add_prefix: bool = False + ) -> str: + qualified_name = struct_dict.get(_MODEL_FIELD) + json_str = struct_dict.get(_JSON_FIELD) + if qualified_name is None or json_str is None: + raise ValueError( + f"Missing '{_MODEL_FIELD}'/'{_JSON_FIELD}' in struct dict" 
+ ) + content = f"{qualified_name}\n{json_str}".encode("utf-8") + content_hash = self._compute_content_hash(content) + return self._format_hash_string(content_hash.digest, add_prefix=add_prefix) diff --git a/tests/test_pydantic_config.py b/tests/test_pydantic_config.py index e2fc667a..98d48608 100644 --- a/tests/test_pydantic_config.py +++ b/tests/test_pydantic_config.py @@ -4,9 +4,11 @@ from pathlib import Path +import pyarrow as pa +import pydantic import pytest -from orcapod.pydantic_config import OrcapodBaseConfig, load_pydantic_config +from orcapod.pydantic_config import OrcapodBaseConfig, PydanticModelConverter, load_pydantic_config class SampleConfig(OrcapodBaseConfig): @@ -64,3 +66,42 @@ def test_empty_file_raises_value_error(tmp_path): with pytest.raises(ValueError) as exc: load_pydantic_config(path, SampleConfig) assert str(path) in str(exc.value) + + +def _converter() -> PydanticModelConverter: + return PydanticModelConverter() + + +def test_converter_python_type_and_struct_signature(): + conv = _converter() + assert conv.python_type is pydantic.BaseModel + sig = conv.arrow_struct_type + assert pa.types.is_struct(sig) + assert {f.name for f in sig} == {"__pydantic_model__", "__pydantic_json__"} + assert all(f.type == pa.large_string() for f in sig) + + +def test_converter_can_handle_model_subclass(): + conv = _converter() + assert conv.can_handle_python_type(SampleConfig) is True + assert conv.can_handle_python_type(int) is False + + +def test_converter_roundtrip_model_to_struct_to_model(): + conv = _converter() + cfg = SampleConfig(name="run1", threshold=6.0, retries=5) + struct = conv.python_to_struct_dict(cfg) + assert set(struct.keys()) == {"__pydantic_model__", "__pydantic_json__"} + assert struct["__pydantic_model__"].endswith(":SampleConfig") + restored = conv.struct_dict_to_python(struct) + assert isinstance(restored, SampleConfig) + assert restored == cfg + + +def test_converter_can_handle_struct_type_and_is_semantic_struct(): + conv = 
_converter() + assert conv.can_handle_struct_type(conv.arrow_struct_type) is True + assert conv.can_handle_struct_type(pa.struct([pa.field("path", pa.large_string())])) is False + cfg = SampleConfig(name="x", threshold=1.0) + assert conv.is_semantic_struct(conv.python_to_struct_dict(cfg)) is True + assert conv.is_semantic_struct({"path": "/tmp/x"}) is False From 4a809de7e2670cd9e07bf31729731d0a4a4fb630 Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Fri, 12 Jun 2026 22:44:43 +0000 Subject: [PATCH 07/12] fix(pydantic_config): clearer import errors, stricter struct check, lazy pyarrow (ENG-607) Co-Authored-By: Claude Opus 4.8 (1M context) --- src/orcapod/pydantic_config.py | 32 ++++++++++++++++++++++++-------- tests/test_pydantic_config.py | 9 +++++++++ 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/src/orcapod/pydantic_config.py b/src/orcapod/pydantic_config.py index a5d2478c..c03b053a 100644 --- a/src/orcapod/pydantic_config.py +++ b/src/orcapod/pydantic_config.py @@ -16,9 +16,12 @@ import yaml from orcapod.semantic_types.semantic_struct_converters import SemanticStructConverterBase +from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: import pyarrow as pa +else: + pa = LazyModule("pyarrow") M = TypeVar("M", bound=pydantic.BaseModel) @@ -74,10 +77,21 @@ def _qualified_name(cls: type) -> str: def _import_model(qualified_name: str) -> type[pydantic.BaseModel]: module_path, _, qualname = qualified_name.partition(":") - module = importlib.import_module(module_path) + try: + module = importlib.import_module(module_path) + except ImportError as e: + raise ImportError( + f"Cannot import module '{module_path}' for pydantic model " + f"'{qualified_name}': {e}" + ) from e obj: Any = module for part in qualname.split("."): - obj = getattr(obj, part) + try: + obj = getattr(obj, part) + except AttributeError as e: + raise ImportError( + f"Cannot resolve '{part}' in '{qualified_name}': {e}" + ) from e return obj @@ -92,8 +106,6 @@ class 
PydanticModelConverter(SemanticStructConverterBase): def __init__(self) -> None: super().__init__("pydantic") - import pyarrow as pa - self._arrow_struct_type = pa.struct( [ pa.field(_MODEL_FIELD, pa.large_string()), @@ -106,7 +118,7 @@ def python_type(self) -> type: return pydantic.BaseModel @property - def arrow_struct_type(self) -> Any: + def arrow_struct_type(self) -> "pa.StructType": return self._arrow_struct_type def can_handle_python_type(self, python_type: type) -> bool: @@ -115,8 +127,6 @@ def can_handle_python_type(self, python_type: type) -> bool: ) def can_handle_struct_type(self, struct_type: Any) -> bool: - import pyarrow as pa - if not pa.types.is_struct(struct_type): return False for field in self._arrow_struct_type: @@ -128,13 +138,19 @@ def can_handle_struct_type(self, struct_type: Any) -> bool: return True def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: - return set(struct_dict.keys()) == {_MODEL_FIELD, _JSON_FIELD} + return ( + set(struct_dict.keys()) == {_MODEL_FIELD, _JSON_FIELD} + and isinstance(struct_dict[_MODEL_FIELD], str) + and isinstance(struct_dict[_JSON_FIELD], str) + ) def python_to_struct_dict(self, value: Any) -> dict[str, Any]: if not isinstance(value, pydantic.BaseModel): raise TypeError(f"Expected a pydantic BaseModel, got {type(value)}") return { _MODEL_FIELD: _qualified_name(type(value)), + # model_dump_json() serialises fields in definition order (pydantic v2), + # so equal models produce identical JSON -> stable content hash. 
_JSON_FIELD: value.model_dump_json(), } diff --git a/tests/test_pydantic_config.py b/tests/test_pydantic_config.py index 98d48608..1c7dae89 100644 --- a/tests/test_pydantic_config.py +++ b/tests/test_pydantic_config.py @@ -105,3 +105,12 @@ def test_converter_can_handle_struct_type_and_is_semantic_struct(): cfg = SampleConfig(name="x", threshold=1.0) assert conv.is_semantic_struct(conv.python_to_struct_dict(cfg)) is True assert conv.is_semantic_struct({"path": "/tmp/x"}) is False + + +def test_struct_dict_to_python_bad_qualname_raises_importerror(): + conv = _converter() + with pytest.raises(ImportError) as exc: + conv.struct_dict_to_python( + {"__pydantic_model__": "no.such.module:Nope", "__pydantic_json__": "{}"} + ) + assert "no.such.module:Nope" in str(exc.value) From cb8f976287fb657eaebd7ce077a23690ec2fe399 Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Fri, 12 Jun 2026 22:46:09 +0000 Subject: [PATCH 08/12] test(pydantic_config): assert hash tracks config meaning, not formatting (ENG-607) Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/test_pydantic_config.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/test_pydantic_config.py b/tests/test_pydantic_config.py index 1c7dae89..7626c350 100644 --- a/tests/test_pydantic_config.py +++ b/tests/test_pydantic_config.py @@ -114,3 +114,34 @@ def test_struct_dict_to_python_bad_qualname_raises_importerror(): {"__pydantic_model__": "no.such.module:Nope", "__pydantic_json__": "{}"} ) assert "no.such.module:Nope" in str(exc.value) + + +def test_hash_equal_for_equal_values(): + conv = _converter() + a = conv.python_to_struct_dict(SampleConfig(name="run1", threshold=6.0, retries=5)) + b = conv.python_to_struct_dict(SampleConfig(name="run1", threshold=6.0, retries=5)) + assert conv.hash_struct_dict(a) == conv.hash_struct_dict(b) + + +def test_hash_differs_for_different_values(): + conv = _converter() + a = conv.python_to_struct_dict(SampleConfig(name="run1", threshold=6.0)) + 
b = conv.python_to_struct_dict(SampleConfig(name="run1", threshold=7.0)) + assert conv.hash_struct_dict(a) != conv.hash_struct_dict(b) + + +def test_hash_stable_across_yaml_formatting(tmp_path): + # Two YAMLs that differ only in comments / key order / whitespace + # must produce the same validated model and therefore the same hash. + yaml_a = "name: run1\nthreshold: 6.0\nretries: 5\n" + yaml_b = "# a comment\nretries: 5\nthreshold: 6.0\nname: run1\n" + pa_path = _write(tmp_path, yaml_a) + cfg_a = load_pydantic_config(pa_path, SampleConfig) + pb_path = tmp_path / "b.yaml" + pb_path.write_text(yaml_b, encoding="utf-8") + cfg_b = load_pydantic_config(pb_path, SampleConfig) + + conv = _converter() + ha = conv.hash_struct_dict(conv.python_to_struct_dict(cfg_a)) + hb = conv.hash_struct_dict(conv.python_to_struct_dict(cfg_b)) + assert ha == hb From 3aea43d4d3daaa6709254aae902918699cad3122 Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Fri, 12 Jun 2026 22:48:39 +0000 Subject: [PATCH 09/12] feat(pydantic_config): register PydanticModelConverter in default registries (ENG-607) Co-Authored-By: Claude Opus 4.8 (1M context) --- src/orcapod/contexts/data/v0.1.json | 4 +++ src/orcapod/hashing/versioned_hashers.py | 4 +++ tests/test_pydantic_config.py | 43 ++++++++++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index 2fb31a70..f0e31f77 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -23,6 +23,10 @@ "_config": { "file_hasher": {"_ref": "file_hasher"} } + }, + "pydantic": { + "_class": "orcapod.pydantic_config.PydanticModelConverter", + "_config": {} } } } diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index f736293b..3627be36 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -137,6 +137,10 @@ def get_versioned_semantic_arrow_hasher( 
path_converter: Any = PythonPathStructConverter(file_hasher=file_hasher) registry.register_converter("path", path_converter) + from orcapod.pydantic_config import PydanticModelConverter + + registry.register_converter("pydantic", PydanticModelConverter()) + logger.debug( "get_versioned_semantic_arrow_hasher: creating StarfixArrowHasher " "(hasher_id=%r)", diff --git a/tests/test_pydantic_config.py b/tests/test_pydantic_config.py index 7626c350..2d4b81aa 100644 --- a/tests/test_pydantic_config.py +++ b/tests/test_pydantic_config.py @@ -145,3 +145,46 @@ def test_hash_stable_across_yaml_formatting(tmp_path): ha = conv.hash_struct_dict(conv.python_to_struct_dict(cfg_a)) hb = conv.hash_struct_dict(conv.python_to_struct_dict(cfg_b)) assert ha == hb + + +# --------------------------------------------------------------------------- +# Integration tests — default context registry (ENG-607 Task 5) +# --------------------------------------------------------------------------- + +from orcapod.contexts import get_default_context # noqa: E402 +from orcapod.types import Schema # noqa: E402 + + +def test_registered_in_default_context_roundtrip(): + ctx = get_default_context() + converter = ctx.type_converter + + cfg = SampleConfig(name="run1", threshold=6.0, retries=5) + table = converter.python_dicts_to_arrow_table( + [{"config": cfg}], python_schema=Schema({"config": SampleConfig}) + ) + # Stored as the pydantic struct, not an opaque blob. 
+ assert pa.types.is_struct(table.schema.field("config").type) + assert {f.name for f in table.schema.field("config").type} == { + "__pydantic_model__", + "__pydantic_json__", + } + + restored = converter.arrow_table_to_python_dicts(table) + assert isinstance(restored[0]["config"], SampleConfig) + assert restored[0]["config"] == cfg + + +def test_default_context_hashes_model_stably(): + ctx = get_default_context() + converter = ctx.type_converter + schema = Schema({"config": SampleConfig}) + t1 = converter.python_dicts_to_arrow_table( + [{"config": SampleConfig(name="r", threshold=6.0)}], python_schema=schema + ) + t2 = converter.python_dicts_to_arrow_table( + [{"config": SampleConfig(name="r", threshold=6.0)}], python_schema=schema + ) + h1 = ctx.arrow_hasher.hash_table(t1) + h2 = ctx.arrow_hasher.hash_table(t2) + assert h1 == h2 From c8784344fec52f0b09f29dc27402e662a85508d1 Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Fri, 12 Jun 2026 22:55:21 +0000 Subject: [PATCH 10/12] chore(pydantic_config): add registry-sync note, tidy test imports (ENG-607) Co-Authored-By: Claude Opus 4.8 (1M context) --- src/orcapod/hashing/versioned_hashers.py | 2 ++ tests/test_pydantic_config.py | 5 ++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index 3627be36..66abc28f 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -135,6 +135,8 @@ def get_versioned_semantic_arrow_hasher( registry: Any = SemanticTypeRegistry() file_hasher = BasicFileHasher(algorithm="sha256") path_converter: Any = PythonPathStructConverter(file_hasher=file_hasher) + # NOTE: keep this converter list in sync with the production registry in + # src/orcapod/contexts/data/v0.1.json (semantic_registry._config.converters). 
registry.register_converter("path", path_converter) from orcapod.pydantic_config import PydanticModelConverter diff --git a/tests/test_pydantic_config.py b/tests/test_pydantic_config.py index 2d4b81aa..d1effc91 100644 --- a/tests/test_pydantic_config.py +++ b/tests/test_pydantic_config.py @@ -8,7 +8,9 @@ import pydantic import pytest +from orcapod.contexts import get_default_context from orcapod.pydantic_config import OrcapodBaseConfig, PydanticModelConverter, load_pydantic_config +from orcapod.types import Schema class SampleConfig(OrcapodBaseConfig): @@ -151,9 +153,6 @@ def test_hash_stable_across_yaml_formatting(tmp_path): # Integration tests — default context registry (ENG-607 Task 5) # --------------------------------------------------------------------------- -from orcapod.contexts import get_default_context # noqa: E402 -from orcapod.types import Schema # noqa: E402 - def test_registered_in_default_context_roundtrip(): ctx = get_default_context() From b45d165162fb73868350a1b9c7ad11fcc8f5ce93 Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Fri, 12 Jun 2026 23:04:15 +0000 Subject: [PATCH 11/12] fix(pydantic_config): canonicalize JSON (sorted keys) before hashing (ENG-607) Hash over sorted-key JSON so configs that differ only in dict key order hash equal -- identity tracks meaning, not formatting. Stored JSON used for reconstruction is unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/orcapod/pydantic_config.py | 15 +++++++++++---- tests/test_pydantic_config.py | 13 +++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/orcapod/pydantic_config.py b/src/orcapod/pydantic_config.py index c03b053a..8dac32d9 100644 --- a/src/orcapod/pydantic_config.py +++ b/src/orcapod/pydantic_config.py @@ -1,4 +1,4 @@ -"""Pydantic-backed config loading for orcapod pipelines (ENG-601 / ENG-607). +"""Pydantic-backed config loading for orcapod pipelines (ENG-607). 
Provides `load_pydantic_config` (validate a YAML file against a pydantic model) and `OrcapodBaseConfig` (a strict base for config schemas). A companion @@ -9,6 +9,7 @@ from __future__ import annotations import importlib +import json from pathlib import Path from typing import TYPE_CHECKING, Any, TypeVar @@ -100,8 +101,9 @@ class PydanticModelConverter(SemanticStructConverterBase): Maps any `pydantic.BaseModel` instance to an Arrow struct holding the model's fully-qualified class name and its canonical JSON, and back. Content - is hashed over (class name + canonical JSON), so identity tracks the config's - meaning rather than source-file formatting. Modeled on `PythonPathStructConverter`. + is hashed over (class name + sorted-key canonical JSON), so identity tracks + the config's meaning rather than source-file formatting or dict key order. + Modeled on `PythonPathStructConverter`. """ def __init__(self) -> None: @@ -173,6 +175,11 @@ def hash_struct_dict( raise ValueError( f"Missing '{_MODEL_FIELD}'/'{_JSON_FIELD}' in struct dict" ) - content = f"{qualified_name}\n{json_str}".encode("utf-8") + # Canonicalize (sorted keys) so semantically-equal configs that differ only + # in dict key order hash equal -- identity tracks meaning, not formatting. 
+ canonical_json = json.dumps( + json.loads(json_str), sort_keys=True, separators=(",", ":") + ) + content = f"{qualified_name}\n{canonical_json}".encode("utf-8") content_hash = self._compute_content_hash(content) return self._format_hash_string(content_hash.digest, add_prefix=add_prefix) diff --git a/tests/test_pydantic_config.py b/tests/test_pydantic_config.py index d1effc91..26eeb0ce 100644 --- a/tests/test_pydantic_config.py +++ b/tests/test_pydantic_config.py @@ -19,6 +19,11 @@ class SampleConfig(OrcapodBaseConfig): retries: int = 3 +class DictConfig(OrcapodBaseConfig): + name: str + params: dict[str, int] + + def _write(tmp_path: Path, text: str) -> Path: p = tmp_path / "config.yaml" p.write_text(text, encoding="utf-8") @@ -174,6 +179,14 @@ def test_registered_in_default_context_roundtrip(): assert restored[0]["config"] == cfg +def test_hash_stable_across_dict_key_order(): + conv = _converter() + a = conv.python_to_struct_dict(DictConfig(name="x", params={"a": 1, "b": 2})) + b = conv.python_to_struct_dict(DictConfig(name="x", params={"b": 2, "a": 1})) + # Same contents, different insertion order -> must hash equal (meaning, not order). + assert conv.hash_struct_dict(a) == conv.hash_struct_dict(b) + + def test_default_context_hashes_model_stably(): ctx = get_default_context() converter = ctx.type_converter From c23fe264dee4f1ac94bd36b79cf00851c0e9eace Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Fri, 12 Jun 2026 23:37:11 +0000 Subject: [PATCH 12/12] feat(pydantic_config): support UPath for object-storage config files (ENG-607) load_pydantic_config now resolves the path through UPath and reads via read_text, so configs on s3://, gs://, etc. work in addition to local paths. Per PR review. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/orcapod/pydantic_config.py | 26 +++++++++++++++++--------- tests/test_pydantic_config.py | 10 ++++++++++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/src/orcapod/pydantic_config.py b/src/orcapod/pydantic_config.py index 8dac32d9..6ecf21e6 100644 --- a/src/orcapod/pydantic_config.py +++ b/src/orcapod/pydantic_config.py @@ -15,6 +15,7 @@ import pydantic import yaml +from upath import UPath from orcapod.semantic_types.semantic_struct_converters import SemanticStructConverterBase from orcapod.utils.lazy_module import LazyModule @@ -38,29 +39,36 @@ class OrcapodBaseConfig(pydantic.BaseModel): model_config = pydantic.ConfigDict(extra="forbid", frozen=True) -def load_pydantic_config(path: str | Path, model_cls: type[M]) -> M: +def load_pydantic_config(path: str | Path | UPath, model_cls: type[M]) -> M: """Read a YAML file and validate it against a pydantic model. + The path is resolved through ``UPath``, so local paths and remote object + storage (e.g. ``s3://``, ``gs://``) are both supported. + Args: - path: Path to the YAML config file. + path: Path to the YAML config file. A local path or any ``UPath``-supported + URI (e.g. an object-storage location). model_cls: The pydantic model class to validate against. Returns: A validated instance of `model_cls`. Raises: - ValueError: If the YAML cannot be parsed or fails validation. The error - message includes the file path and the underlying field-level detail. + ValueError: If the file cannot be read, the YAML cannot be parsed, or + validation fails. The error message includes the file path and the + underlying detail. 
""" - path = Path(path) + path = UPath(path) try: - with open(path, "r", encoding="utf-8") as f: - data = yaml.safe_load(f) - except yaml.YAMLError as e: - raise ValueError(f"Could not parse YAML config {path}: {e}") from e + text = path.read_text(encoding="utf-8") except OSError as e: raise ValueError(f"Could not read YAML config {path}: {e}") from e + try: + data = yaml.safe_load(text) + except yaml.YAMLError as e: + raise ValueError(f"Could not parse YAML config {path}: {e}") from e + try: return model_cls.model_validate(data) except pydantic.ValidationError as e: diff --git a/tests/test_pydantic_config.py b/tests/test_pydantic_config.py index 26eeb0ce..75acbc62 100644 --- a/tests/test_pydantic_config.py +++ b/tests/test_pydantic_config.py @@ -7,6 +7,7 @@ import pyarrow as pa import pydantic import pytest +from upath import UPath from orcapod.contexts import get_default_context from orcapod.pydantic_config import OrcapodBaseConfig, PydanticModelConverter, load_pydantic_config @@ -39,6 +40,15 @@ def test_loads_valid_config(tmp_path): assert cfg.retries == 3 # default applied +def test_loads_via_upath(tmp_path): + # UPath of a local path exercises the object-storage-capable read path. + path = _write(tmp_path, "name: run1\nthreshold: 6.0\n") + cfg = load_pydantic_config(UPath(path), SampleConfig) + assert isinstance(cfg, SampleConfig) + assert cfg.name == "run1" + assert cfg.threshold == 6.0 + + def test_wrong_type_raises_with_path(tmp_path): path = _write(tmp_path, "name: run1\nthreshold: not-a-number\n") with pytest.raises(ValueError) as exc: