From 3be642a6fb4de6d1843223493e9c2cf9e7074f1d Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 10:46:00 +0000 Subject: [PATCH 01/33] docs(plt-1660): add design spec for hard cut to extension type hashing Covers: visitor visit_extension dispatch, SemanticHashingVisitor rewrite, StarfixArrowHasher constructor update, renames (BaseSemanticHasher -> SemanticAwarePythonHasher, TypeHandlerRegistry -> PythonTypeHandlerRegistry), v0.1.json / context_schema.json changes, and deletion plan for old SemanticTypeRegistry / SemanticStructConverter files. Co-Authored-By: Claude Sonnet 4.6 --- ...lt-1660-hard-cut-extension-type-hashing.md | 412 ++++++++++++++++++ 1 file changed, 412 insertions(+) create mode 100644 superpowers/specs/2026-06-24-plt-1660-hard-cut-extension-type-hashing.md diff --git a/superpowers/specs/2026-06-24-plt-1660-hard-cut-extension-type-hashing.md b/superpowers/specs/2026-06-24-plt-1660-hard-cut-extension-type-hashing.md new file mode 100644 index 00000000..726839ad --- /dev/null +++ b/superpowers/specs/2026-06-24-plt-1660-hard-cut-extension-type-hashing.md @@ -0,0 +1,412 @@ +# PLT-1660: Hard cut — delete old semantic type system and wire in extension type system + +**Date:** 2026-06-24 +**Issue:** PLT-1660 +**Branch:** `eywalker/plt-1660-hard-cut-delete-old-semantic-type-system-and-wire-in` +**Target:** `extension-type-system` + +--- + +## Overview + +The codebase currently has two parallel "semantic type" systems: + +1. **Old system** (shape-based identity): `SemanticTypeRegistry` / `SemanticStructConverterProtocol` — identifies extension + types by matching Arrow struct field signatures. Lives in `src/orcapod/semantic_types/`. +2. **New system** (extension type identity): `LogicalTypeRegistry` / `LogicalTypeProtocol` — identifies types by + `ARROW:extension:name` metadata embedded in the Arrow field. Lives in `src/orcapod/extension_types/`. + +The `UniversalTypeConverter` already uses only the new system. This issue performs a "hard cut": delete the old +system entirely and wire the new system into the remaining production call sites — primarily the Arrow hashing visitors. + +--- + +## Scope + +### In scope +- Rewrite `SemanticHashingVisitor` in `visitors.py` to dispatch on extension types instead of struct signatures +- Update `StarfixArrowHasher` (and `SemanticArrowHasher`) to accept `type_converter + semantic_hasher` instead of `semantic_registry` +- Rename `BaseSemanticHasher` → `SemanticAwarePythonHasher` +- Rename `TypeHandlerRegistry` → `PythonTypeHandlerRegistry`, `BuiltinTypeHandlerRegistry` → `BuiltinPythonTypeHandlerRegistry` +- Update `v0.1.json` to remove the `semantic_registry` component and update all cross-refs +- Update `context_schema.json` to match +- Delete `semantic_struct_converters.py`, `semantic_registry.py`, the `SemanticStructConverterProtocol` class, and the old semantic type test directory +- Update all imports and references across the codebase + +### Out of scope +- PLT-1798 (making `extension_name == logical_type_name` invariant explicit in code) +- Any changes to `UniversalTypeConverter` — already fully migrated + +--- + +## Design + +### 1. Extension-type dispatch in `ArrowTypeDataVisitor` + +**File:** `src/orcapod/hashing/visitors.py` + +Add `visit_extension` as a non-abstract method on the base class. Update `visit()` to check +`isinstance(arrow_type, pa.ExtensionType)` **before** the struct check, since extension types with +struct storage are otherwise swallowed by `visit_struct`. + +```python +def visit_extension( + self, extension_type: "pa.ExtensionType", storage_value: Any +) -> tuple["pa.DataType", Any]: + """Handle an Arrow extension type. + + Default implementation: passthrough (preserves extension name and storage value + unchanged so that the underlying StarfixArrowHasher / ArrowDigester sees the full + extension metadata when it receives the pre-processed table). + + Subclasses may override to convert recognised extension types to a hashed + binary value (pa.large_binary()). + """ + return extension_type, storage_value + +def visit(self, arrow_type: "pa.DataType", data: Any) -> tuple["pa.DataType", Any]: + # Extension types must be checked FIRST; a Path column has storage type + # large_string, and its field type is an ExtensionType wrapping that storage. + # If we checked is_struct first, extension types with struct storage would be + # incorrectly routed to visit_struct. + if isinstance(arrow_type, pa.ExtensionType): + new_type, new_data = self.visit_extension(arrow_type, data) + # Re-visit the result if visit_extension transformed it into a non-extension type. + # This allows future composability (e.g. a "list of extension type" handler that + # returns a pa.large_list(pa.large_binary()) from visit_extension) and avoids + # infinite recursion since we only re-enter when the type changed AND is no + # longer an extension type. + if new_type is not arrow_type and not isinstance(new_type, pa.ExtensionType): + return self.visit(new_type, new_data) + return new_type, new_data + if pa.types.is_struct(arrow_type): + return self.visit_struct(arrow_type, data) + elif pa.types.is_list(arrow_type) or pa.types.is_large_list(arrow_type): + return self.visit_list(arrow_type, data) + elif pa.types.is_fixed_size_list(arrow_type): + return self.visit_list(arrow_type, data) + elif pa.types.is_map(arrow_type): + return self.visit_map(arrow_type, data) + else: + return self.visit_primitive(arrow_type, data) +``` + +### 2. `SemanticHashingVisitor` rewrite + +**File:** `src/orcapod/hashing/visitors.py` + +The constructor changes from `(semantic_registry: SemanticTypeRegistry)` to +`(type_converter: UniversalTypeConverter, python_hasher: SemanticAwarePythonHasher)`. + +The core logic moves from `visit_struct` into `visit_extension`: + +```python +class SemanticHashingVisitor(ArrowTypeDataVisitor): + """Visitor that replaces extension-typed columns with their content hashes. + + For each Arrow column whose type is a ``pa.ExtensionType``: + 1. Look up the corresponding Python type via ``type_converter``. + 2. If the Python type has a handler registered in ``python_hasher``, convert + the storage value to a Python object and hash it, replacing the column + with a ``pa.large_binary()`` value of the form:: + + extension_name_bytes + b":" + content_hash.to_prefixed_digest() + + where ``content_hash.to_prefixed_digest()`` = ``method_bytes + b":" + digest``. + 3. If no handler is registered (or if ``type_converter`` does not know the + extension type), return the extension type and storage value unchanged. + The downstream ``StarfixArrowHasher`` / ``ArrowDigester`` will see the + full extension metadata intact and include it in the cross-language hash. + """ + + def __init__( + self, + type_converter: "UniversalTypeConverter", + python_hasher: "SemanticAwarePythonHasher", + ) -> None: + self._type_converter = type_converter + self._python_hasher = python_hasher + self._current_field_path: list[str] = [] + + def visit_extension( + self, extension_type: "pa.ExtensionType", storage_value: Any + ) -> tuple["pa.DataType", Any]: + if storage_value is None: + return extension_type, None + + # Resolve extension type → Python type. + python_type = self._type_converter.arrow_type_to_python_type(extension_type) + + # If the converter couldn't resolve to a concrete class, passthrough. + if python_type is Any or not isinstance(python_type, type): + return extension_type, storage_value + + # Only hash if the python hasher has a handler for this type. + if not self._python_hasher.type_handler_registry.has_handler(python_type): + return extension_type, storage_value + + # Convert storage value → Python object and hash it. + python_obj = self._type_converter.storage_to_python(storage_value, python_type) + content_hash = self._python_hasher.hash_object(python_obj) + + # Encode as binary: "::" + # extension_name identifies the logical type; the content_hash.to_prefixed_digest() + # encodes the method name + raw digest bytes (compatible with pa.large_binary() + # columns elsewhere in the codebase that use h.to_prefixed_digest()). + hash_bytes = ( + extension_type.extension_name.encode("ascii") + + b":" + + content_hash.to_prefixed_digest() + ) + return pa.large_binary(), hash_bytes + + def visit_struct(self, struct_type, data): + """Regular struct (no extension identity) — recurse into fields.""" + if data is None: + return struct_type, None + return self._visit_struct_fields(struct_type, data) + + def visit_list(self, list_type, data): + if data is None: + return list_type, None + self._current_field_path.append("[*]") + try: + return self._visit_list_elements(list_type, data) + finally: + self._current_field_path.pop() + + def visit_map(self, map_type, data): + return map_type, data + + def visit_primitive(self, primitive_type, data): + return primitive_type, data +``` + +**Passthrough invariant:** when `visit_extension` returns the original `(extension_type, storage_value)`, +the column's field type remains a `pa.ExtensionType`. `schema_cleaner.clean_schema_for_hashing` retains +all `ARROW:extension:*` metadata, so `ArrowDigester.hash_table(..., include_metadata=True)` will see the +full extension identity. This ensures that extension types without a registered Python handler are still +hashed in a type-aware way by the underlying starfix algorithm. + +### 3. `StarfixArrowHasher` constructor update + +**File:** `src/orcapod/hashing/arrow_hashers.py` + +```python +# Before +def __init__(self, semantic_registry: SemanticTypeRegistry, hasher_id: str) -> None: + self.semantic_registry = semantic_registry + +# After +def __init__( + self, + type_converter: "UniversalTypeConverter", + semantic_hasher: "SemanticAwarePythonHasher", + hasher_id: str, +) -> None: + self._type_converter = type_converter + self._semantic_hasher = semantic_hasher +``` + +`_process_table_columns` creates `SemanticHashingVisitor(self._type_converter, self._semantic_hasher)` instead of +`SemanticHashingVisitor(self.semantic_registry)`. + +The short-circuit in `_process_table_columns` that skips non-struct/non-list columns should be updated: extension +types at the top level of a column CAN need processing, so the check should also pass through when +`isinstance(field.type, pa.ExtensionType)` is True (skip the short-circuit, so the visitor can dispatch +`visit_extension`). + +### 4. `SemanticArrowHasher` (legacy hasher) + +**File:** `src/orcapod/hashing/arrow_hashers.py` + +`SemanticArrowHasher` predates `StarfixArrowHasher` and is not referenced in `v0.1.json`. Apply the same +constructor change (`semantic_registry` → `type_converter + semantic_hasher`) for consistency, or delete it +entirely if no tests depend on it. Preference: **delete** as part of the hard cut. + +### 5. Renames + +| Old name | New name | File | +|----------|----------|------| +| `BaseSemanticHasher` | `SemanticAwarePythonHasher` | `src/orcapod/hashing/semantic_hashing/semantic_hasher.py` | +| `TypeHandlerRegistry` | `PythonTypeHandlerRegistry` | `src/orcapod/hashing/semantic_hashing/type_handler_registry.py` | +| `BuiltinTypeHandlerRegistry` | `BuiltinPythonTypeHandlerRegistry` | `src/orcapod/hashing/semantic_hashing/type_handler_registry.py` | + +All references across the codebase (imports, JSON specs, tests, docs) must be updated in the same PR. + +Per the project's no-backward-compatibility policy: no re-export aliases or deprecation wrappers. + +### 6. `v0.1.json` changes + +**File:** `src/orcapod/contexts/data/v0.1.json` + +- Remove the `semantic_registry` top-level component entirely. +- In `arrow_hasher._config`, replace: + ```json + "semantic_registry": {"_ref": "semantic_registry"} + ``` + with: + ```json + "type_converter": {"_ref": "type_converter"}, + "semantic_hasher": {"_ref": "semantic_hasher"} + ``` +- Rename the `type_handler_registry` component key → `python_type_handler_registry`. + Update the `semantic_hasher._config` ref accordingly: + ```json + "type_handler_registry": {"_ref": "python_type_handler_registry"} + ``` +- Update `arrow_hasher._class` from `StarfixArrowHasher` (already correct) and verify `semantic_hasher._class` is updated to `SemanticAwarePythonHasher`. +- Update `type_handler_registry` (inside `_config`) class references: + `TypeHandlerRegistry` → `PythonTypeHandlerRegistry` + +Full updated component list in file order: +``` +file_hasher (unchanged) +semantic_registry ← DELETE +arrow_hasher (updated refs: type_converter + semantic_hasher) +type_converter (unchanged) +function_info_extractor(unchanged) +python_type_handler_registry ← renamed from type_handler_registry +semantic_hasher (class → SemanticAwarePythonHasher, ref updated) +``` + +### 7. `context_schema.json` changes + +**File:** `src/orcapod/contexts/data/schemas/context_schema.json` + +- Remove the `semantic_registry` property from `properties`. +- Rename `type_handler_registry` property to `python_type_handler_registry`. + +### 8. `DataContext` core + +**File:** `src/orcapod/contexts/core.py` + +`DataContext` is a dataclass with `type_converter`, `arrow_hasher`, and `semantic_hasher` fields. +The `type_handler_registry` is not a field on `DataContext` — it is an implementation detail of the +`semantic_hasher`. No changes needed to `core.py` for this issue. + +### 9. `versioned_hashers.py` + +**File:** `src/orcapod/hashing/versioned_hashers.py` + +Update `get_versioned_semantic_arrow_hasher()` to use the new constructor signature: +```python +from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher +# ... +hasher = StarfixArrowHasher( + hasher_id=hasher_id, + type_converter=type_converter, # UniversalTypeConverter from DataContext + semantic_hasher=semantic_hasher, # SemanticAwarePythonHasher from DataContext +) +``` + +Since `versioned_hashers.py` currently constructs its own `SemanticTypeRegistry` inline, this module +needs to source `type_converter` and `semantic_hasher` from the active `DataContext` instead. If no context +is available at call time, wire it from the default context. + +--- + +## Files to delete + +| File | Reason | +|------|--------| +| `src/orcapod/semantic_types/semantic_struct_converters.py` | Old shape-based converters (PythonPathStructConverter, UUIDStructConverter, UPathStructConverter) | +| `src/orcapod/semantic_types/semantic_registry.py` | Old SemanticTypeRegistry | +| `SemanticStructConverterProtocol` class in `src/orcapod/protocols/semantic_types_protocols.py` | Protocol for old converters | +| `tests/test_semantic_types/` (all 9 files) | Tests for the old system | + +After deletion, verify `src/orcapod/semantic_types/__init__.py` no longer re-exports deleted names. + +--- + +## Files to update (beyond the core changes) + +These files import from the deleted / renamed modules and must be updated: + +- `src/orcapod/hashing/__init__.py` — re-exports `SemanticArrowHasher` (if deleted) and `TypeHandlerRegistry` (renamed) +- `src/orcapod/hashing/versioned_hashers.py` — inline `SemanticTypeRegistry` construction, renamed hasher class +- `src/orcapod/contexts/registry.py` — constructs contexts from JSON; will pick up new class names automatically via `parse_objectspec` as long as the JSON is updated +- `src/orcapod/__init__.py` — any top-level re-exports +- `tests/test_hashing/` — update imports and any `SemanticTypeRegistry` references + +Run `grep -r "SemanticTypeRegistry\|semantic_registry\|SemanticStructConverter\|BaseSemanticHasher\|TypeHandlerRegistry\|BuiltinTypeHandlerRegistry" src/ tests/` after implementation to catch any remaining references. + +--- + +## Binary encoding format + +Hash values produced by `visit_extension` are stored as `pa.large_binary()` with the layout: + +``` + ":" +``` + +where `content_hash.to_prefixed_digest()` = `method.encode("ascii") + b":" + digest_bytes`. + +Full example for a `pathlib.Path` column hashed with SHA-256: +``` +b"orcapod.path:semantic_v0.1:\xab\xcd\xef..." +``` + +This is consistent with the pattern already used in `function_node.py`: +```python +self.data_context.arrow_hasher.hash_table(tag_with_hash).to_prefixed_digest() +``` + +--- + +## Extension type short-circuit fix + +In `StarfixArrowHasher._process_table_columns`, the current short-circuit bypasses the visitor for +non-struct/non-list columns: + +```python +if not ( + pa.types.is_struct(field.type) + or pa.types.is_list(field.type) + or ... +): + new_columns.append(table.column(i)) # skipped — no visitor call + ... + continue +``` + +Extension type columns whose storage type is `pa.large_string()` (e.g. `orcapod.path`) would be +short-circuited here. The fix: also skip the short-circuit when the field type is an extension type: + +```python +if not ( + isinstance(field.type, pa.ExtensionType) # ← add this + or pa.types.is_struct(field.type) + or pa.types.is_list(field.type) + or pa.types.is_large_list(field.type) + or pa.types.is_fixed_size_list(field.type) + or pa.types.is_map(field.type) +): + ... + continue +``` + +--- + +## Test strategy + +1. Existing tests in `tests/test_hashing/` must all pass after the rename and wiring changes. +2. `tests/test_extension_types/` round-trip tests verify the conversion chain; these should continue to pass. +3. The deleted `tests/test_semantic_types/` tests are replaced implicitly by the extension type integration + tests — no new test file is required unless a specific gap is identified. +4. Run: `uv run pytest tests/test_hashing/ tests/test_extension_types/ tests/test_core/ -x` + +--- + +## Implementation order + +1. Rename `BaseSemanticHasher` → `SemanticAwarePythonHasher` and `TypeHandlerRegistry` → `PythonTypeHandlerRegistry` (update all references). +2. Add `visit_extension` to `ArrowTypeDataVisitor`; update `visit()` dispatch. +3. Rewrite `SemanticHashingVisitor` constructor and `visit_extension` implementation. +4. Update `StarfixArrowHasher` constructor; update `_process_table_columns` short-circuit. +5. Update `v0.1.json` and `context_schema.json`. +6. Update `versioned_hashers.py`. +7. Delete old semantic type files and their tests. +8. Run grep sweep for stale references; fix any found. +9. Run full test suite. From 1f16a9356cf2449839e7203129b9f41954418a46 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 16:18:42 +0000 Subject: [PATCH 02/33] docs(plt-1660): update spec with protocol tightening and full renames Folds in: TypeHandlerProtocol.handle()->Any tightened to PythonTypeSemanticHasherProtocol.hash()->ContentHash; all builtin handlers renamed to *SemanticHasher and updated to return ContentHash directly; registry renamed to PythonTypeSemanticHasherRegistry with updated method names. Co-Authored-By: Claude Sonnet 4.6 --- ...lt-1660-hard-cut-extension-type-hashing.md | 489 ++++++++++++------ 1 file changed, 344 insertions(+), 145 deletions(-) diff --git a/superpowers/specs/2026-06-24-plt-1660-hard-cut-extension-type-hashing.md b/superpowers/specs/2026-06-24-plt-1660-hard-cut-extension-type-hashing.md index 726839ad..3bb0d215 100644 --- a/superpowers/specs/2026-06-24-plt-1660-hard-cut-extension-type-hashing.md +++ b/superpowers/specs/2026-06-24-plt-1660-hard-cut-extension-type-hashing.md @@ -11,13 +11,20 @@ The codebase currently has two parallel "semantic type" systems: -1. **Old system** (shape-based identity): `SemanticTypeRegistry` / `SemanticStructConverterProtocol` — identifies extension - types by matching Arrow struct field signatures. Lives in `src/orcapod/semantic_types/`. +1. **Old system** (shape-based identity): `SemanticTypeRegistry` / `SemanticStructConverterProtocol` — identifies + extension types by matching Arrow struct field signatures. Lives in `src/orcapod/semantic_types/`. 2. **New system** (extension type identity): `LogicalTypeRegistry` / `LogicalTypeProtocol` — identifies types by `ARROW:extension:name` metadata embedded in the Arrow field. Lives in `src/orcapod/extension_types/`. -The `UniversalTypeConverter` already uses only the new system. This issue performs a "hard cut": delete the old -system entirely and wire the new system into the remaining production call sites — primarily the Arrow hashing visitors. +`UniversalTypeConverter` already uses only the new system. This issue performs a "hard cut": delete the old +system entirely and wire the new system into the remaining production call sites — primarily the Arrow hashing +visitors. + +This issue also folds in a protocol tightening: `TypeHandlerProtocol.handle()` currently has a mixed return +type (`Any`) — some handlers return `ContentHash` directly (Path, ArrowTable), while others return intermediate +values (UUID returns `bytes`, BytesHandler returns `str`, etc.). Since all handlers receive the full hasher +reference and the only purpose of a handler is to produce a hash, the protocol is tightened so every handler +returns `ContentHash` directly. This makes the naming accurate and the interface uniform. --- @@ -25,12 +32,22 @@ system entirely and wire the new system into the remaining production call sites ### In scope - Rewrite `SemanticHashingVisitor` in `visitors.py` to dispatch on extension types instead of struct signatures -- Update `StarfixArrowHasher` (and `SemanticArrowHasher`) to accept `type_converter + semantic_hasher` instead of `semantic_registry` -- Rename `BaseSemanticHasher` → `SemanticAwarePythonHasher` -- Rename `TypeHandlerRegistry` → `PythonTypeHandlerRegistry`, `BuiltinTypeHandlerRegistry` → `BuiltinPythonTypeHandlerRegistry` -- Update `v0.1.json` to remove the `semantic_registry` component and update all cross-refs +- Update `StarfixArrowHasher` (and delete `SemanticArrowHasher`) to accept `type_converter + semantic_hasher` + instead of `semantic_registry` +- **Protocol tightening**: change `TypeHandlerProtocol.handle() -> Any` to + `PythonTypeSemanticHasherProtocol.hash() -> ContentHash`; update all builtin handlers accordingly +- **Renames** (full list in §Design §5): + - `BaseSemanticHasher` → `SemanticAwarePythonHasher` + - `TypeHandlerRegistry` → `PythonTypeSemanticHasherRegistry` + - `BuiltinTypeHandlerRegistry` → `BuiltinPythonTypeSemanticHasherRegistry` + - `TypeHandlerProtocol` → `PythonTypeSemanticHasherProtocol` + - All builtin handler classes renamed (e.g. `PathContentHandler` → `PathSemanticHasher`) + - `register_builtin_handlers` → `register_builtin_python_type_semantic_hashers` + - `get_default_type_handler_registry` → `get_default_python_type_semantic_hasher_registry` +- Update `v0.1.json` to remove `semantic_registry` component and update all class names / cross-refs - Update `context_schema.json` to match -- Delete `semantic_struct_converters.py`, `semantic_registry.py`, the `SemanticStructConverterProtocol` class, and the old semantic type test directory +- Delete `semantic_struct_converters.py`, `semantic_registry.py`, `SemanticStructConverterProtocol`, and + `tests/test_semantic_types/` - Update all imports and references across the codebase ### Out of scope @@ -46,8 +63,8 @@ system entirely and wire the new system into the remaining production call sites **File:** `src/orcapod/hashing/visitors.py` Add `visit_extension` as a non-abstract method on the base class. Update `visit()` to check -`isinstance(arrow_type, pa.ExtensionType)` **before** the struct check, since extension types with -struct storage are otherwise swallowed by `visit_struct`. +`isinstance(arrow_type, pa.ExtensionType)` **before** the struct check — otherwise extension types with +struct storage would be swallowed by `visit_struct`. ```python def visit_extension( @@ -55,27 +72,26 @@ def visit_extension( ) -> tuple["pa.DataType", Any]: """Handle an Arrow extension type. - Default implementation: passthrough (preserves extension name and storage value - unchanged so that the underlying StarfixArrowHasher / ArrowDigester sees the full - extension metadata when it receives the pre-processed table). + Default implementation: passthrough — preserves the extension type and its storage + value unchanged so that the downstream StarfixArrowHasher / ArrowDigester sees the + full extension metadata when it receives the pre-processed table. Subclasses may override to convert recognised extension types to a hashed - binary value (pa.large_binary()). + pa.large_binary() value. """ return extension_type, storage_value def visit(self, arrow_type: "pa.DataType", data: Any) -> tuple["pa.DataType", Any]: - # Extension types must be checked FIRST; a Path column has storage type + # Extension types must be checked FIRST. A Path column has storage type # large_string, and its field type is an ExtensionType wrapping that storage. - # If we checked is_struct first, extension types with struct storage would be - # incorrectly routed to visit_struct. + # Checking is_struct first would incorrectly route extension types with struct + # storage into visit_struct. if isinstance(arrow_type, pa.ExtensionType): new_type, new_data = self.visit_extension(arrow_type, data) - # Re-visit the result if visit_extension transformed it into a non-extension type. - # This allows future composability (e.g. a "list of extension type" handler that - # returns a pa.large_list(pa.large_binary()) from visit_extension) and avoids - # infinite recursion since we only re-enter when the type changed AND is no - # longer an extension type. + # Re-visit if visit_extension transformed to a non-extension type. + # This enables composability (e.g. a list-of-extension-type handler returning + # pa.large_list(pa.large_binary())) and avoids infinite recursion: we only + # re-enter when the type changed AND is no longer an extension type. if new_type is not arrow_type and not isinstance(new_type, pa.ExtensionType): return self.visit(new_type, new_data) return new_type, new_data @@ -95,10 +111,10 @@ def visit(self, arrow_type: "pa.DataType", data: Any) -> tuple["pa.DataType", An **File:** `src/orcapod/hashing/visitors.py` -The constructor changes from `(semantic_registry: SemanticTypeRegistry)` to +Constructor changes from `(semantic_registry: SemanticTypeRegistry)` to `(type_converter: UniversalTypeConverter, python_hasher: SemanticAwarePythonHasher)`. -The core logic moves from `visit_struct` into `visit_extension`: +Core logic moves from `visit_struct` into `visit_extension`: ```python class SemanticHashingVisitor(ArrowTypeDataVisitor): @@ -106,17 +122,17 @@ class SemanticHashingVisitor(ArrowTypeDataVisitor): For each Arrow column whose type is a ``pa.ExtensionType``: 1. Look up the corresponding Python type via ``type_converter``. - 2. If the Python type has a handler registered in ``python_hasher``, convert - the storage value to a Python object and hash it, replacing the column - with a ``pa.large_binary()`` value of the form:: + 2. If the Python type has a semantic hasher registered in ``python_hasher``, + convert the storage value to a Python object and hash it, replacing the + column with a ``pa.large_binary()`` value of the form:: extension_name_bytes + b":" + content_hash.to_prefixed_digest() where ``content_hash.to_prefixed_digest()`` = ``method_bytes + b":" + digest``. - 3. If no handler is registered (or if ``type_converter`` does not know the + 3. If no hasher is registered (or if ``type_converter`` does not know the extension type), return the extension type and storage value unchanged. The downstream ``StarfixArrowHasher`` / ``ArrowDigester`` will see the - full extension metadata intact and include it in the cross-language hash. + full extension metadata intact and hash it in a type-aware way. """ def __init__( @@ -141,8 +157,8 @@ class SemanticHashingVisitor(ArrowTypeDataVisitor): if python_type is Any or not isinstance(python_type, type): return extension_type, storage_value - # Only hash if the python hasher has a handler for this type. - if not self._python_hasher.type_handler_registry.has_handler(python_type): + # Only hash if the python hasher has a semantic hasher for this type. + if not self._python_hasher.type_semantic_hasher_registry.has_semantic_hasher(python_type): return extension_type, storage_value # Convert storage value → Python object and hash it. @@ -150,9 +166,6 @@ class SemanticHashingVisitor(ArrowTypeDataVisitor): content_hash = self._python_hasher.hash_object(python_obj) # Encode as binary: "::" - # extension_name identifies the logical type; the content_hash.to_prefixed_digest() - # encodes the method name + raw digest bytes (compatible with pa.large_binary() - # columns elsewhere in the codebase that use h.to_prefixed_digest()). hash_bytes = ( extension_type.extension_name.encode("ascii") + b":" @@ -184,9 +197,9 @@ class SemanticHashingVisitor(ArrowTypeDataVisitor): **Passthrough invariant:** when `visit_extension` returns the original `(extension_type, storage_value)`, the column's field type remains a `pa.ExtensionType`. `schema_cleaner.clean_schema_for_hashing` retains -all `ARROW:extension:*` metadata, so `ArrowDigester.hash_table(..., include_metadata=True)` will see the -full extension identity. This ensures that extension types without a registered Python handler are still -hashed in a type-aware way by the underlying starfix algorithm. +all `ARROW:extension:*` metadata, so `ArrowDigester.hash_table(..., include_metadata=True)` sees the full +extension identity. Extension types without a registered Python semantic hasher are still hashed in a +type-aware way by the underlying starfix algorithm. ### 3. `StarfixArrowHasher` constructor update @@ -195,7 +208,6 @@ hashed in a type-aware way by the underlying starfix algorithm. ```python # Before def __init__(self, semantic_registry: SemanticTypeRegistry, hasher_id: str) -> None: - self.semantic_registry = semantic_registry # After def __init__( @@ -206,37 +218,220 @@ def __init__( ) -> None: self._type_converter = type_converter self._semantic_hasher = semantic_hasher + self._hasher_id = hasher_id ``` -`_process_table_columns` creates `SemanticHashingVisitor(self._type_converter, self._semantic_hasher)` instead of -`SemanticHashingVisitor(self.semantic_registry)`. +`_process_table_columns` constructs `SemanticHashingVisitor(self._type_converter, self._semantic_hasher)` +instead of `SemanticHashingVisitor(self.semantic_registry)`. + +The short-circuit in `_process_table_columns` that skips non-struct/non-list columns must also allow +extension type columns through — otherwise Path columns (storage: `large_string`) would be silently skipped +before the visitor sees them: -The short-circuit in `_process_table_columns` that skips non-struct/non-list columns should be updated: extension -types at the top level of a column CAN need processing, so the check should also pass through when -`isinstance(field.type, pa.ExtensionType)` is True (skip the short-circuit, so the visitor can dispatch -`visit_extension`). +```python +if not ( + isinstance(field.type, pa.ExtensionType) # ← add this + or pa.types.is_struct(field.type) + or pa.types.is_list(field.type) + or pa.types.is_large_list(field.type) + or pa.types.is_fixed_size_list(field.type) + or pa.types.is_map(field.type) +): + new_columns.append(table.column(i)) + new_fields.append(field) + continue +``` ### 4. `SemanticArrowHasher` (legacy hasher) **File:** `src/orcapod/hashing/arrow_hashers.py` -`SemanticArrowHasher` predates `StarfixArrowHasher` and is not referenced in `v0.1.json`. Apply the same -constructor change (`semantic_registry` → `type_converter + semantic_hasher`) for consistency, or delete it -entirely if no tests depend on it. Preference: **delete** as part of the hard cut. +`SemanticArrowHasher` predates `StarfixArrowHasher` and is not referenced in `v0.1.json`. **Delete** it as +part of the hard cut. If any test depends on it directly, delete the test — these tests are superseded by the +extension type integration tests. ### 5. Renames +#### Classes and protocols + | Old name | New name | File | |----------|----------|------| -| `BaseSemanticHasher` | `SemanticAwarePythonHasher` | `src/orcapod/hashing/semantic_hashing/semantic_hasher.py` | -| `TypeHandlerRegistry` | `PythonTypeHandlerRegistry` | `src/orcapod/hashing/semantic_hashing/type_handler_registry.py` | -| `BuiltinTypeHandlerRegistry` | `BuiltinPythonTypeHandlerRegistry` | `src/orcapod/hashing/semantic_hashing/type_handler_registry.py` | +| `BaseSemanticHasher` | `SemanticAwarePythonHasher` | `semantic_hashing/semantic_hasher.py` | +| `TypeHandlerRegistry` | `PythonTypeSemanticHasherRegistry` | `semantic_hashing/type_handler_registry.py` | +| `BuiltinTypeHandlerRegistry` | `BuiltinPythonTypeSemanticHasherRegistry` | `semantic_hashing/type_handler_registry.py` | +| `TypeHandlerProtocol` | `PythonTypeSemanticHasherProtocol` | `protocols/hashing_protocols.py` | + +#### Builtin handler classes (in `semantic_hashing/builtin_handlers.py`) + +| Old name | New name | +|----------|----------| +| `PathContentHandler` | `PathSemanticHasher` | +| `UPathContentHandler` | `UPathSemanticHasher` | +| `UUIDHandler` | `UUIDSemanticHasher` | +| `BytesHandler` | `BytesSemanticHasher` | +| `FunctionHandler` | `FunctionSemanticHasher` | +| `TypeObjectHandler` | `TypeObjectSemanticHasher` | +| `SpecialFormHandler` | `SpecialFormSemanticHasher` | +| `GenericAliasHandler` | `GenericAliasSemanticHasher` | +| `UnionTypeHandler` | `UnionTypeSemanticHasher` | +| `ArrowTableHandler` | `ArrowTableSemanticHasher` | +| `SchemaHandler` | `SchemaSemanticHasher` | + +#### Functions and properties + +| Old name | New name | Location | +|----------|----------|----------| +| `register_builtin_handlers(registry)` | `register_builtin_python_type_semantic_hashers(registry)` | `builtin_handlers.py` | +| `get_default_type_handler_registry()` | `get_default_python_type_semantic_hasher_registry()` | `type_handler_registry.py` and `defaults.py` | +| `BaseSemanticHasher.type_handler_registry` property | `SemanticAwarePythonHasher.type_semantic_hasher_registry` | `semantic_hasher.py` | + +#### Registry methods + +| Old name | New name | +|----------|----------| +| `get_handler(obj)` | `get_semantic_hasher(obj)` | +| `get_handler_for_type(target_type)` | `get_semantic_hasher_for_type(target_type)` | +| `has_handler(target_type)` | `has_semantic_hasher(target_type)` | + +The `register(target_type, handler)` method name is unchanged — "register" is generic enough. All references across the codebase (imports, JSON specs, tests, docs) must be updated in the same PR. - Per the project's no-backward-compatibility policy: no re-export aliases or deprecation wrappers. -### 6. `v0.1.json` changes +### 6. Protocol tightening — `PythonTypeSemanticHasherProtocol` + +**File:** `src/orcapod/protocols/hashing_protocols.py` + +The `handle(obj, hasher) -> Any` method is replaced by `hash(obj, hasher) -> ContentHash`: + +```python +class PythonTypeSemanticHasherProtocol(Protocol): + """Protocol for type-specific semantic hashers used by SemanticAwarePythonHasher. + + A PythonTypeSemanticHasherProtocol hashes a specific Python type to a ContentHash. + Implementations are registered with a PythonTypeSemanticHasherRegistry and looked + up via MRO-aware resolution. + + Each implementation receives the full SemanticAwarePythonHasher so it can delegate + hashing of sub-values (e.g. hashing a dict of function metadata) back to the outer + hasher without coupling to a specific hasher instance. + """ + + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + """Hash *obj* to a ContentHash. + + Args: + obj: The object to hash. Always matches the registered type. + hasher: The active SemanticAwarePythonHasher. Use + ``hasher.hash_object(sub_value)`` to hash sub-values. + + Returns: + ContentHash: The content-addressed hash of *obj*. + """ + ... +``` + +#### `hash_object()` simplification + +Because every semantic hasher now returns `ContentHash` directly, the dispatch in `hash_object()` simplifies +from a double call to a single call: + +```python +# Before +semantic_hasher = self._registry.get_semantic_hasher(obj) +if semantic_hasher is not None: + return self.hash_object(semantic_hasher.handle(obj, self), resolver=resolver) + # ^^^ recursive wrap ^^^ + +# After +semantic_hasher = self._registry.get_semantic_hasher(obj) +if semantic_hasher is not None: + return semantic_hasher.hash(obj, self) # always ContentHash — no wrap +``` + +#### Updated builtin implementations + +Each builtin class returns `ContentHash` directly by delegating sub-values back to `hasher.hash_object()`: + +```python +class PathSemanticHasher: + def __init__(self, file_hasher: FileContentHasherProtocol) -> None: + self.file_hasher = file_hasher + + def hash(self, obj: PathLike, hasher: SemanticAwarePythonHasher) -> ContentHash: + path = Path(obj) + # (existence / is_dir checks unchanged) + return self.file_hasher.hash_file(path) # already returns ContentHash + + +class UUIDSemanticHasher: + def hash(self, obj: Any, hasher: SemanticAwarePythonHasher) -> ContentHash: + return hasher.hash_object(obj.bytes) # bytes → ContentHash via hasher + + +class BytesSemanticHasher: + def hash(self, obj: Any, hasher: SemanticAwarePythonHasher) -> ContentHash: + if isinstance(obj, (bytes, bytearray)): + return hasher.hash_object(obj.hex()) # hex str → ContentHash via hasher + raise TypeError(...) + + +class FunctionSemanticHasher: + def __init__(self, function_info_extractor: Any) -> None: + self.function_info_extractor = function_info_extractor + + def hash(self, obj: Any, hasher: SemanticAwarePythonHasher) -> ContentHash: + info = self.function_info_extractor.extract_function_info(obj) + return hasher.hash_object(info) # dict → ContentHash via hasher + + +class TypeObjectSemanticHasher: + def hash(self, obj: Any, hasher: SemanticAwarePythonHasher) -> ContentHash: + module = obj.__module__ or "" + return hasher.hash_object(f"type:{module}.{obj.__qualname__}") + + +class ArrowTableSemanticHasher: + def __init__(self, arrow_hasher: ArrowHasherProtocol) -> None: + self.arrow_hasher = arrow_hasher + + def hash(self, obj: Any, hasher: SemanticAwarePythonHasher) -> ContentHash: + if isinstance(obj, pa.RecordBatch): + obj = pa.Table.from_batches([obj]) + return self.arrow_hasher.hash_table(obj) # already returns ContentHash + + +class SpecialFormSemanticHasher: + def hash(self, obj: Any, hasher: SemanticAwarePythonHasher) -> ContentHash: + name = getattr(obj, "_name", None) or repr(obj) + return hasher.hash_object(f"special_form:typing.{name}") + + +class GenericAliasSemanticHasher: + def hash(self, obj: Any, hasher: SemanticAwarePythonHasher) -> ContentHash: + import typing + origin = getattr(obj, "__origin__", None) + args = getattr(obj, "__args__", None) or () + if origin is None: + return hasher.hash_object(f"generic_alias:{obj!r}") + if origin is typing.Union: + hashed_args = sorted(hasher.hash_object(arg).to_string() for arg in args) + return hasher.hash_object({"__type__": "union", "args": hashed_args}) + return hasher.hash_object({ + "__type__": "generic_alias", + "origin": hasher.hash_object(origin).to_string(), + "args": [hasher.hash_object(arg).to_string() for arg in args], + }) + + +class UnionTypeSemanticHasher: + def hash(self, obj: Any, hasher: SemanticAwarePythonHasher) -> ContentHash: + args = getattr(obj, "__args__", None) or () + hashed_args = sorted(hasher.hash_object(arg).to_string() for arg in args) + return hasher.hash_object({"__type__": "union", "args": hashed_args}) +``` + +### 7. `v0.1.json` changes **File:** `src/orcapod/contexts/data/v0.1.json` @@ -250,59 +445,70 @@ Per the project's no-backward-compatibility policy: no re-export aliases or depr "type_converter": {"_ref": "type_converter"}, "semantic_hasher": {"_ref": "semantic_hasher"} ``` -- Rename the `type_handler_registry` component key → `python_type_handler_registry`. - Update the `semantic_hasher._config` ref accordingly: +- Rename component key `type_handler_registry` → `python_type_semantic_hasher_registry`. +- Update `semantic_hasher._config` ref: ```json - "type_handler_registry": {"_ref": "python_type_handler_registry"} + "type_handler_registry": {"_ref": "python_type_semantic_hasher_registry"} ``` -- Update `arrow_hasher._class` from `StarfixArrowHasher` (already correct) and verify `semantic_hasher._class` is updated to `SemanticAwarePythonHasher`. -- Update `type_handler_registry` (inside `_config`) class references: - `TypeHandlerRegistry` → `PythonTypeHandlerRegistry` +- Update `semantic_hasher._class`: + `orcapod.hashing.semantic_hashing.semantic_hasher.BaseSemanticHasher` + → `orcapod.hashing.semantic_hashing.semantic_hasher.SemanticAwarePythonHasher` +- Update `python_type_semantic_hasher_registry._class`: + `orcapod.hashing.semantic_hashing.type_handler_registry.TypeHandlerRegistry` + → `orcapod.hashing.semantic_hashing.type_handler_registry.PythonTypeSemanticHasherRegistry` +- Update all handler `_class` entries in `python_type_semantic_hasher_registry._config.handlers` + to use the new class names (e.g. `PathContentHandler` → `PathSemanticHasher`, etc.) Full updated component list in file order: ``` -file_hasher (unchanged) -semantic_registry ← DELETE -arrow_hasher (updated refs: type_converter + semantic_hasher) -type_converter (unchanged) -function_info_extractor(unchanged) -python_type_handler_registry ← renamed from type_handler_registry -semantic_hasher (class → SemanticAwarePythonHasher, ref updated) +file_hasher (unchanged) +semantic_registry ← DELETE +arrow_hasher (class unchanged; _config: + type_converter ref, + semantic_hasher ref, - semantic_registry ref) +type_converter (unchanged) +function_info_extractor (unchanged) +python_type_semantic_hasher_registry ← renamed from type_handler_registry; class + handler entries updated +semantic_hasher (class → SemanticAwarePythonHasher; ref updated) ``` -### 7. `context_schema.json` changes +### 8. `context_schema.json` changes **File:** `src/orcapod/contexts/data/schemas/context_schema.json` - Remove the `semantic_registry` property from `properties`. -- Rename `type_handler_registry` property to `python_type_handler_registry`. +- Rename `type_handler_registry` property to `python_type_semantic_hasher_registry`. -### 8. `DataContext` core +### 9. `DataContext` core **File:** `src/orcapod/contexts/core.py` `DataContext` is a dataclass with `type_converter`, `arrow_hasher`, and `semantic_hasher` fields. -The `type_handler_registry` is not a field on `DataContext` — it is an implementation detail of the -`semantic_hasher`. No changes needed to `core.py` for this issue. +`type_handler_registry` is not a field on `DataContext` — it is an implementation detail of `semantic_hasher`. +No changes needed to `core.py`. -### 9. `versioned_hashers.py` +### 10. `versioned_hashers.py` **File:** `src/orcapod/hashing/versioned_hashers.py` -Update `get_versioned_semantic_arrow_hasher()` to use the new constructor signature: +Update `get_versioned_semantic_arrow_hasher()`: +- Remove inline `SemanticTypeRegistry` / `PythonPathStructConverter` / `UUIDStructConverter` construction. +- Source `type_converter` and `semantic_hasher` from the default `DataContext`: + ```python -from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher -# ... -hasher = StarfixArrowHasher( - hasher_id=hasher_id, - type_converter=type_converter, # UniversalTypeConverter from DataContext - semantic_hasher=semantic_hasher, # SemanticAwarePythonHasher from DataContext -) +def get_versioned_semantic_arrow_hasher( + hasher_id: str = _CURRENT_ARROW_HASHER_ID, +) -> hp.ArrowHasherProtocol: + from orcapod.hashing.arrow_hashers import StarfixArrowHasher + from orcapod.contexts import resolve_context + + ctx = resolve_context(None) # default context + return StarfixArrowHasher( + hasher_id=hasher_id, + type_converter=ctx.type_converter, + semantic_hasher=ctx.semantic_hasher, + ) ``` -Since `versioned_hashers.py` currently constructs its own `SemanticTypeRegistry` inline, this module -needs to source `type_converter` and `semantic_hasher` from the active `DataContext` instead. If no context -is available at call time, wire it from the default context. +Update `get_versioned_semantic_hasher()` to import `SemanticAwarePythonHasher` instead of `BaseSemanticHasher`. --- @@ -310,10 +516,10 @@ is available at call time, wire it from the default context. | File | Reason | |------|--------| -| `src/orcapod/semantic_types/semantic_struct_converters.py` | Old shape-based converters (PythonPathStructConverter, UUIDStructConverter, UPathStructConverter) | -| `src/orcapod/semantic_types/semantic_registry.py` | Old SemanticTypeRegistry | -| `SemanticStructConverterProtocol` class in `src/orcapod/protocols/semantic_types_protocols.py` | Protocol for old converters | -| `tests/test_semantic_types/` (all 9 files) | Tests for the old system | +| `src/orcapod/semantic_types/semantic_struct_converters.py` | Old shape-based converters | +| `src/orcapod/semantic_types/semantic_registry.py` | Old `SemanticTypeRegistry` | +| `SemanticStructConverterProtocol` class in `src/orcapod/protocols/semantic_types_protocols.py` | Protocol for old system | +| `tests/test_semantic_types/` (all 9 files) | Tests for old system | After deletion, verify `src/orcapod/semantic_types/__init__.py` no longer re-exports deleted names. @@ -321,15 +527,28 @@ After deletion, verify `src/orcapod/semantic_types/__init__.py` no longer re-exp ## Files to update (beyond the core changes) -These files import from the deleted / renamed modules and must be updated: - -- `src/orcapod/hashing/__init__.py` — re-exports `SemanticArrowHasher` (if deleted) and `TypeHandlerRegistry` (renamed) -- `src/orcapod/hashing/versioned_hashers.py` — inline `SemanticTypeRegistry` construction, renamed hasher class -- `src/orcapod/contexts/registry.py` — constructs contexts from JSON; will pick up new class names automatically via `parse_objectspec` as long as the JSON is updated -- `src/orcapod/__init__.py` — any top-level re-exports -- `tests/test_hashing/` — update imports and any `SemanticTypeRegistry` references - -Run `grep -r "SemanticTypeRegistry\|semantic_registry\|SemanticStructConverter\|BaseSemanticHasher\|TypeHandlerRegistry\|BuiltinTypeHandlerRegistry" src/ tests/` after implementation to catch any remaining references. +These files import from the deleted or renamed modules and must be updated: + +- `src/orcapod/hashing/__init__.py` — re-exports `BaseSemanticHasher`, `TypeHandlerRegistry`, `TypeHandlerProtocol` +- `src/orcapod/hashing/semantic_hashing/__init__.py` — re-exports all renamed classes +- `src/orcapod/hashing/defaults.py` — `get_default_type_handler_registry` → `get_default_python_type_semantic_hasher_registry` +- `src/orcapod/hashing/semantic_hashing/content_identifiable_mixin.py` — references `BaseSemanticHasher` +- `src/orcapod/hashing/versioned_hashers.py` — inline registry construction, old class names +- `src/orcapod/protocols/hashing_protocols.py` — `TypeHandlerProtocol` docstring references +- `src/orcapod/contexts/core.py` — `TYPE_CHECKING` import of `BaseSemanticHasher` (if any) +- `tests/test_hashing/` — update imports and any direct registry/handler references + +Run this sweep after implementation to catch any remaining references: + +```bash +grep -rn "SemanticTypeRegistry\|semantic_registry\|SemanticStructConverter\ +\|BaseSemanticHasher\|TypeHandlerRegistry\|BuiltinTypeHandlerRegistry\ +\|TypeHandlerProtocol\|PathContentHandler\|UPathContentHandler\ +\|UUIDHandler\|BytesHandler\|FunctionHandler\|TypeObjectHandler\ +\|SpecialFormHandler\|GenericAliasHandler\|UnionTypeHandler\|ArrowTableHandler\ +\|SchemaHandler\|register_builtin_handlers\|get_default_type_handler_registry\ +\|type_handler_registry\|get_handler\|has_handler" src/ tests/ +``` --- @@ -343,70 +562,50 @@ Hash values produced by `visit_extension` are stored as `pa.large_binary()` with where `content_hash.to_prefixed_digest()` = `method.encode("ascii") + b":" + digest_bytes`. -Full example for a `pathlib.Path` column hashed with SHA-256: +Full example for a `pathlib.Path` column whose file is hashed with SHA-256 by the semantic hasher: ``` b"orcapod.path:semantic_v0.1:\xab\xcd\xef..." + ^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^ + hasher_id raw SHA-256 digest ``` -This is consistent with the pattern already used in `function_node.py`: +This is consistent with the existing pattern in `function_node.py`: ```python self.data_context.arrow_hasher.hash_table(tag_with_hash).to_prefixed_digest() ``` --- -## Extension type short-circuit fix - -In `StarfixArrowHasher._process_table_columns`, the current short-circuit bypasses the visitor for -non-struct/non-list columns: - -```python -if not ( - pa.types.is_struct(field.type) - or pa.types.is_list(field.type) - or ... -): - new_columns.append(table.column(i)) # skipped — no visitor call - ... - continue -``` - -Extension type columns whose storage type is `pa.large_string()` (e.g. `orcapod.path`) would be -short-circuited here. The fix: also skip the short-circuit when the field type is an extension type: - -```python -if not ( - isinstance(field.type, pa.ExtensionType) # ← add this - or pa.types.is_struct(field.type) - or pa.types.is_list(field.type) - or pa.types.is_large_list(field.type) - or pa.types.is_fixed_size_list(field.type) - or pa.types.is_map(field.type) -): - ... - continue -``` - ---- - ## Test strategy -1. Existing tests in `tests/test_hashing/` must all pass after the rename and wiring changes. -2. `tests/test_extension_types/` round-trip tests verify the conversion chain; these should continue to pass. -3. The deleted `tests/test_semantic_types/` tests are replaced implicitly by the extension type integration - tests — no new test file is required unless a specific gap is identified. +1. Existing tests in `tests/test_hashing/` must all pass after renames, protocol changes, and wiring. +2. `tests/test_extension_types/` round-trip tests verify the full conversion chain; these must pass. +3. The deleted `tests/test_semantic_types/` tests are superseded by the extension type integration tests. 4. Run: `uv run pytest tests/test_hashing/ tests/test_extension_types/ tests/test_core/ -x` --- ## Implementation order -1. Rename `BaseSemanticHasher` → `SemanticAwarePythonHasher` and `TypeHandlerRegistry` → `PythonTypeHandlerRegistry` (update all references). -2. Add `visit_extension` to `ArrowTypeDataVisitor`; update `visit()` dispatch. -3. Rewrite `SemanticHashingVisitor` constructor and `visit_extension` implementation. -4. Update `StarfixArrowHasher` constructor; update `_process_table_columns` short-circuit. -5. Update `v0.1.json` and `context_schema.json`. -6. Update `versioned_hashers.py`. -7. Delete old semantic type files and their tests. -8. Run grep sweep for stale references; fix any found. -9. Run full test suite. +1. **Rename `TypeHandlerProtocol` → `PythonTypeSemanticHasherProtocol`**, change `handle() -> Any` to + `hash() -> ContentHash` in `protocols/hashing_protocols.py`. Update docstring. +2. **Rename `TypeHandlerRegistry` → `PythonTypeSemanticHasherRegistry`**, rename all registry methods + (`get_handler` → `get_semantic_hasher`, `has_handler` → `has_semantic_hasher`, etc.), + rename `BuiltinTypeHandlerRegistry` → `BuiltinPythonTypeSemanticHasherRegistry`. +3. **Update all builtin handler classes** in `builtin_handlers.py`: rename each class, change `handle()` → + `hash()`, update return type from `Any` → `ContentHash`, update implementations to return `ContentHash` + directly. Rename `register_builtin_handlers` → `register_builtin_python_type_semantic_hashers`. +4. **Rename `BaseSemanticHasher` → `SemanticAwarePythonHasher`** in `semantic_hasher.py`: simplify + `hash_object()` dispatch (remove double-wrap), rename `type_handler_registry` property → + `type_semantic_hasher_registry`, rename `get_default_type_handler_registry` → + `get_default_python_type_semantic_hasher_registry`. +5. **Update `__init__.py` exports** in `hashing/` and `hashing/semantic_hashing/` to use new names. +6. **Add `visit_extension` to `ArrowTypeDataVisitor`**; update `visit()` dispatch. +7. **Rewrite `SemanticHashingVisitor`** constructor and `visit_extension` implementation. +8. **Update `StarfixArrowHasher`**: new constructor signature, `_process_table_columns` short-circuit fix, + delete `SemanticArrowHasher`. +9. **Update `v0.1.json`** and **`context_schema.json`**. +10. **Update `versioned_hashers.py`** to source from `DataContext`. +11. **Delete** old semantic type files and their tests. +12. **Run grep sweep** for stale references; fix any found. +13. **Run full test suite**: `uv run pytest tests/test_hashing/ tests/test_extension_types/ tests/test_core/ -x` From f432895759f9962b991eaacaf4f36910761f9fcc Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 16:46:57 +0000 Subject: [PATCH 03/33] docs(plt-1660): update binary encoding format to use "::" separator and colon namespace Extension name dots replaced with colons (orcapod.path -> orcapod:path); "::" used as separator between type prefix and hash so the boundary is unambiguous (to_prefixed_digest uses only single ":"). Co-Authored-By: Claude Sonnet 4.6 --- ...lt-1660-hard-cut-extension-type-hashing.md | 39 +++++++++++++------ 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/superpowers/specs/2026-06-24-plt-1660-hard-cut-extension-type-hashing.md b/superpowers/specs/2026-06-24-plt-1660-hard-cut-extension-type-hashing.md index 3bb0d215..f6f5b009 100644 --- a/superpowers/specs/2026-06-24-plt-1660-hard-cut-extension-type-hashing.md +++ b/superpowers/specs/2026-06-24-plt-1660-hard-cut-extension-type-hashing.md @@ -126,9 +126,13 @@ class SemanticHashingVisitor(ArrowTypeDataVisitor): convert the storage value to a Python object and hash it, replacing the column with a ``pa.large_binary()`` value of the form:: - extension_name_bytes + b":" + content_hash.to_prefixed_digest() + type_name_bytes + b"::" + content_hash.to_prefixed_digest() - where ``content_hash.to_prefixed_digest()`` = ``method_bytes + b":" + digest``. + where ``type_name`` is the extension name with dots replaced by colons + (e.g. ``"orcapod.path"`` → ``"orcapod:path"``), and + ``content_hash.to_prefixed_digest()`` = ``method_bytes + b":" + digest``. + The ``::`` separator is unambiguous because ``to_prefixed_digest()`` only + uses single ``:``. Splitting on ``b"::"`` recovers both parts cleanly. 3. If no hasher is registered (or if ``type_converter`` does not know the extension type), return the extension type and storage value unchanged. The downstream ``StarfixArrowHasher`` / ``ArrowDigester`` will see the @@ -165,10 +169,14 @@ class SemanticHashingVisitor(ArrowTypeDataVisitor): python_obj = self._type_converter.storage_to_python(storage_value, python_type) content_hash = self._python_hasher.hash_object(python_obj) - # Encode as binary: "::" + # Encode as binary: ":::" + # Dots in the extension name are replaced with colons so the type prefix + # uses a consistent namespace separator (e.g. "orcapod:path"). + # The "::" separator is unambiguous — to_prefixed_digest() only uses ":". + type_name = extension_type.extension_name.replace(".", ":") hash_bytes = ( - extension_type.extension_name.encode("ascii") - + b":" + type_name.encode("ascii") + + b"::" + content_hash.to_prefixed_digest() ) return pa.large_binary(), hash_bytes @@ -557,18 +565,27 @@ grep -rn "SemanticTypeRegistry\|semantic_registry\|SemanticStructConverter\ Hash values produced by `visit_extension` are stored as `pa.large_binary()` with the layout: ``` - ":" + "::" ``` -where `content_hash.to_prefixed_digest()` = `method.encode("ascii") + b":" + digest_bytes`. +where: +- `type_name` = `extension_type.extension_name.replace(".", ":")` — dots in the Arrow extension + name are replaced with colons so the prefix uses a uniform namespace separator + (e.g. `"orcapod.path"` → `"orcapod:path"`, `"my.module.MyClass"` → `"my:module:MyClass"`) +- `"::"` is the separator between type prefix and hash — unambiguous because + `to_prefixed_digest()` only uses single `":"` +- `content_hash.to_prefixed_digest()` = `method.encode("ascii") + b":" + digest_bytes` -Full example for a `pathlib.Path` column whose file is hashed with SHA-256 by the semantic hasher: +Full example for a `pathlib.Path` column whose file is hashed by the semantic hasher: ``` -b"orcapod.path:semantic_v0.1:\xab\xcd\xef..." - ^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^ - hasher_id raw SHA-256 digest +b"orcapod:path::semantic_v0.1:\xab\xcd\xef..." + ^^^^^^^^^^^ ^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^ + type prefix hasher_id raw SHA-256 digest + (dots→colons) ``` +Parsing: `value.split(b"::", 1)` → `(b"orcapod:path", b"semantic_v0.1:\xab...")`. + This is consistent with the existing pattern in `function_node.py`: ```python self.data_context.arrow_hasher.hash_table(tag_with_hash).to_prefixed_digest() From 450dde89229102232b0f1f5a56cc34b558680f41 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 17:50:19 +0000 Subject: [PATCH 04/33] =?UTF-8?q?refactor(hashing=5Fprotocols):=20rename?= =?UTF-8?q?=20TypeHandlerProtocol=20=E2=86=92=20PythonTypeSemanticHasherPr?= =?UTF-8?q?otocol,=20tighten=20hash()=20=E2=86=92=20ContentHash?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/orcapod/protocols/hashing_protocols.py | 192 ++++----------------- 1 file changed, 31 insertions(+), 161 deletions(-) diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index e824211a..cee17709 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: import pyarrow as pa - from orcapod.hashing.semantic_hashing.type_handler_registry import TypeHandlerRegistry + from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeSemanticHasherRegistry @runtime_checkable @@ -18,208 +18,81 @@ class DataContextAwareProtocol(Protocol): @property def data_context_key(self) -> str: - """ - Return the data context key associated with this object. - - Returns: - str: The data context key - """ + """Return the data context key associated with this object.""" ... @runtime_checkable class PipelineElementProtocol(Protocol): - """ - Protocol for objects that have a stable identity as an element in a - pipeline graph — determined by schema and upstream topology, not by - data content. - - This is a parallel identity chain to ContentIdentifiableProtocol. - Where content identity captures the precise, data-inclusive identity of - an object, pipeline identity captures only what is structurally meaningful - for pipeline database path scoping: the schemas and the recursive topology - of the upstream computation. - - The base case (RootSource) returns a hash of (tag_schema, data_schema). - Every other element recurses through the pipeline_hash() of its upstream - inputs, with the hash values themselves (ContentHash objects) used as - terminal leaves so no special hasher mode is required. - - Two sources with identical schemas processed through the same function pod - graph will produce the same pipeline_hash() at every downstream node, - enabling automatic multi-source table sharing in the pipeline database. - """ + """Protocol for objects that have a stable identity as an element in a pipeline graph.""" def pipeline_identity_structure(self) -> Any: - """ - Return a structure representing this element's pipeline identity. - - At source nodes (base case): return (tag_schema, data_schema). - At all other nodes: return a structure containing references to - upstream pipeline elements and/or data functions as raw objects. - The pipeline resolver threaded through pipeline_hash() ensures that - PipelineElementProtocol objects are resolved via pipeline_hash() and - other ContentIdentifiable objects via content_hash(), both using the - same hasher throughout the computation. - """ + """Return a structure representing this element's pipeline identity.""" ... def pipeline_hash(self, hasher=None) -> ContentHash: - """ - Return the pipeline-level hash of this element, computed from - pipeline_identity_structure() and cached by hasher_id. - - Args: - hasher: Optional semantic hasher to use. When omitted, resolved - from the element's data_context. - """ + """Return the pipeline-level hash of this element.""" ... @runtime_checkable class ContentIdentifiableProtocol(Protocol): - """ - Protocol for objects that can express their semantic identity as a plain - Python structure. - - This is the only method a class needs to implement to participate in the - content-based hashing system. The returned structure is recursively - resolved by the SemanticHasherProtocol -- any nested ContentIdentifiableProtocol objects - within the structure will themselves be expanded and hashed, producing a - Merkle-tree-like composition of hashes. - - The method should return a deterministic structure whose value depends - only on the semantic content of the object -- not on memory addresses, - object IDs, or other incidental runtime state. - """ + """Protocol for objects that can express their semantic identity as a plain Python structure.""" def identity_structure(self) -> Any: - """ - Return a structure that represents the semantic identity of this object. - - The returned value may be any Python object: - - Primitives (str, int, float, bool, None) are used as-is. - - Collections (list, dict, set, tuple) are recursively traversed. - - Nested ContentIdentifiableProtocol objects are recursively resolved by - the SemanticHasherProtocol: their identity structure is hashed to a - ContentHash hex token, which is then embedded in place of the - object in the parent structure. - - Any type that has a registered TypeHandlerProtocol in the - SemanticHasherProtocol's registry is handled by that handler. + """Return a structure that represents the semantic identity of this object.""" + ... - Returns: - Any: A structure representing this object's semantic content. - Should be deterministic and include all identity-relevant data. - """ + def content_hash(self, hasher: "SemanticHasherProtocol | None" = None) -> ContentHash: + """Returns the content hash.""" ... - def content_hash(self, hasher: SemanticHasherProtocol | None = None) -> ContentHash: - """ - Returns the content hash. - Args: - hasher: Optional semantic hasher to use for the entire recursive - computation. When omitted, resolved from the object's - data_context (or injected hasher for mixin-based objects). - The same hasher propagates to all nested ContentIdentifiable - objects, ensuring one consistent context per computation. - """ - ... +class PythonTypeSemanticHasherProtocol(Protocol): + """Protocol for type-specific semantic hashers used by SemanticAwarePythonHasher. + A ``PythonTypeSemanticHasherProtocol`` hashes a specific Python type to a + ``ContentHash``. Implementations are registered with a + ``PythonTypeSemanticHasherRegistry`` and looked up via MRO-aware resolution. -class TypeHandlerProtocol(Protocol): - """ - Protocol for type-specific serialization handlers used by SemanticHasherProtocol. - - A TypeHandlerProtocol converts a specific Python type into a value that - ``hash_object`` can process. Handlers are registered with a - TypeHandlerRegistry and looked up via MRO-aware resolution. - - The returned value is passed directly back to ``hash_object``, so it may - be anything that ``hash_object`` understands: - - - A primitive (None, bool, int, float, str) -- hashed directly. - - A structure (list, tuple, dict, set, frozenset) -- expanded and hashed. - - A ContentHash -- treated as a terminal; returned as-is without - re-hashing. Use this when the handler has already computed the - definitive hash of the object (e.g. hashing a file's content). - - A ContentIdentifiableProtocol -- its identity_structure() will be called. - - Another registered type -- dispatched through the registry. + Each implementation receives the full ``SemanticAwarePythonHasher`` so it can + delegate hashing of sub-values back to the outer hasher without coupling to a + specific hasher instance. """ - def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: - """ - Convert *obj* into a value that ``hash_object`` can process. + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + """Hash *obj* to a ContentHash. Args: - obj: The object to handle. - hasher: The SemanticHasherProtocol, available if the handler needs to - hash sub-objects explicitly via ``hasher.hash_object()``. + obj: The object to hash. Always matches the registered type. + hasher: The active ``SemanticAwarePythonHasher``. Use + ``hasher.hash_object(sub_value)`` to hash sub-values. Returns: - Any value accepted by ``hash_object``: a primitive, structure, - ContentHash, ContentIdentifiableProtocol, or another registered type. + ContentHash: The content-addressed hash of *obj*. """ ... class SemanticHasherProtocol(Protocol): - """ - Protocol for the semantic content-based hasher. - - ``hash_object(obj)`` is the single recursive entry point. It produces a - ContentHash for any Python object using the following dispatch: - - - ContentHash → terminal; returned as-is - - Primitive → JSON-serialised and hashed directly - - Structure → structurally expanded (type-tagged), then hashed - - Handler match → handler.handle() returns a new value; recurse - - ContentIdentifiableProtocol→ identity_structure() returns a value; recurse - - Unknown → TypeError (strict) or best-effort string (lenient) - - Containers are type-tagged before hashing so that list, tuple, dict, set, - and namedtuple produce distinct hashes even when their elements are equal. - - Unknown types raise TypeError by default (strict mode). Set - strict=False on construction to fall back to a best-effort string - representation with a warning instead. - """ + """Protocol for the semantic content-based hasher.""" def hash_object( self, obj: Any, resolver: Callable[[Any], ContentHash] | None = None, ) -> ContentHash: - """ - Hash *obj* based on its semantic content. - - Args: - obj: The object to hash. - resolver: Optional callable invoked for any ContentIdentifiable - object encountered during hashing. When provided it overrides - the default obj.content_hash() call, allowing the caller to - control which identity chain is used and to propagate a - consistent hasher through the full recursive computation. - - Returns: - ContentHash: Stable, content-based hash of the object. - """ + """Hash *obj* based on its semantic content.""" ... @property def hasher_id(self) -> str: - """ - Returns a unique identifier/name for this hasher instance. - - The hasher_id is embedded in every ContentHash produced by this - hasher, allowing hashes from different versions or configurations - to be distinguished. - """ + """Returns a unique identifier/name for this hasher instance.""" ... @property - def type_handler_registry(self) -> "TypeHandlerRegistry": - """Return the TypeHandlerRegistry used by this hasher.""" + def type_semantic_hasher_registry(self) -> "PythonTypeSemanticHasherRegistry": + """Return the PythonTypeSemanticHasherRegistry used by this hasher.""" ... @@ -269,11 +142,8 @@ def hasher_id(self) -> str: """Unique identifier for this semantic type hasher.""" ... - def hash_column( - self, - column: "pa.Array", - ) -> "pa.Array": - """Hash a column with this semantic type and return the hash bytes an an array""" + def hash_column(self, column: "pa.Array") -> "pa.Array": + """Hash a column with this semantic type and return the hash bytes as an array.""" ... def set_cacher(self, cacher: StringCacherProtocol) -> None: From 55eea4656e1f5f636bf65809a5c037651bd88cc0 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 17:53:11 +0000 Subject: [PATCH 05/33] refactor(hashing_protocols): add SemanticAwarePythonHasher to TYPE_CHECKING imports --- src/orcapod/protocols/hashing_protocols.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index cee17709..e60d9c12 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -10,6 +10,7 @@ if TYPE_CHECKING: import pyarrow as pa from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeSemanticHasherRegistry + from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher @runtime_checkable From 584354463c66a6568ee2fd8a5f7bcf33dd1134af Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 17:54:45 +0000 Subject: [PATCH 06/33] refactor(type_handler_registry): rename to PythonTypeSemanticHasherRegistry, rename methods --- .../semantic_hashing/type_handler_registry.py | 166 +++++------------- 1 file changed, 45 insertions(+), 121 deletions(-) diff --git a/src/orcapod/hashing/semantic_hashing/type_handler_registry.py b/src/orcapod/hashing/semantic_hashing/type_handler_registry.py index 690ec024..ebae2cb5 100644 --- a/src/orcapod/hashing/semantic_hashing/type_handler_registry.py +++ b/src/orcapod/hashing/semantic_hashing/type_handler_registry.py @@ -1,23 +1,5 @@ """ -Type Handler Registry for the SemanticHasherProtocol system. - -Provides a registry through which TypeHandlerProtocol implementations can be -registered for specific Python types. Lookup is MRO-aware: if no handler -is registered for an exact type, the registry walks the MRO of the object's -class to find the nearest ancestor for which a handler has been registered. - -Usage ------ -# Register a handler for a specific type: -registry = TypeHandlerRegistry() -registry.register(Path, PathContentHandler()) - -# Or use the global default registry: -from orcapod.hashing.semantic_hashing.type_handler_registry import get_default_type_handler_registry -get_default_type_handler_registry().register(MyType, MyTypeHandler()) - -# Look up a handler (returns None if not found): -handler = registry.get_handler(some_object) +PythonTypeSemanticHasherRegistry — MRO-aware registry for PythonTypeSemanticHasherProtocol instances. """ from __future__ import annotations @@ -29,21 +11,18 @@ class to find the nearest ancestor for which a handler has been registered. if TYPE_CHECKING: from orcapod.protocols.hashing_protocols import ( ArrowHasherProtocol, - TypeHandlerProtocol, + PythonTypeSemanticHasherProtocol, ) logger = logging.getLogger(__name__) -class TypeHandlerRegistry: - """ - Registry mapping Python types to TypeHandlerProtocol instances. +class PythonTypeSemanticHasherRegistry: + """Registry mapping Python types to PythonTypeSemanticHasherProtocol instances. - Lookup is MRO-aware: when no handler is registered for the exact type of + Lookup is MRO-aware: when no hasher is registered for the exact type of an object, the registry walks the object's MRO (most-derived first) until - it finds a match. This means a handler registered for a base class is - automatically inherited by all subclasses, unless a more specific handler - has been registered for the subclass. + it finds a match. Thread safety ------------- @@ -52,42 +31,28 @@ class TypeHandlerRegistry: """ def __init__( - self, handlers: list[tuple[type, TypeHandlerProtocol]] | None = None + self, handlers: list[tuple[type, "PythonTypeSemanticHasherProtocol"]] | None = None ) -> None: """ Args: - handlers: Optional list of ``(target_type, handler)`` pairs to - register at construction time. Designed for use with - ``parse_objectspec``: the JSON spec provides a list of - two-element arrays where the first element uses ``_type`` - to resolve a Python type and the second uses ``_class`` to - instantiate the handler. + handlers: Optional list of ``(target_type, hasher)`` pairs to + register at construction time. """ - # Maps type -> handler; insertion order is preserved but lookup uses MRO. - self._handlers: dict[type, TypeHandlerProtocol] = {} + self._handlers: dict[type, "PythonTypeSemanticHasherProtocol"] = {} self._lock = threading.RLock() if handlers: for target_type, handler in handlers: self.register(target_type, handler) - # ------------------------------------------------------------------ - # Registration - # ------------------------------------------------------------------ - - def register(self, target_type: type, handler: TypeHandlerProtocol) -> None: - """ - Register a handler for a specific Python type. + def register(self, target_type: type, handler: "PythonTypeSemanticHasherProtocol") -> None: + """Register a hasher for a specific Python type. - If a handler is already registered for *target_type*, it is silently - replaced by the new handler. + If a hasher is already registered for *target_type*, it is silently + replaced by the new hasher. Args: - target_type: The Python type (or class) for which the handler - should be used. Must be a ``type`` object. - handler: A TypeHandlerProtocol instance whose ``handle()`` method will - be called when an object of ``target_type`` (or a - subclass with no more specific handler) is encountered - during structure resolution. + target_type: The Python type (or class) for which the hasher should be used. + handler: A ``PythonTypeSemanticHasherProtocol`` instance. Raises: TypeError: If ``target_type`` is not a ``type``. @@ -100,7 +65,7 @@ def register(self, target_type: type, handler: TypeHandlerProtocol) -> None: existing = self._handlers.get(target_type) if existing is not None and existing is not handler: logger.debug( - "TypeHandlerRegistry: replacing existing handler for %s (%s -> %s)", + "PythonTypeSemanticHasherRegistry: replacing existing hasher for %s (%s -> %s)", target_type.__name__, type(existing).__name__, type(handler).__name__, @@ -108,14 +73,13 @@ def register(self, target_type: type, handler: TypeHandlerProtocol) -> None: self._handlers[target_type] = handler def unregister(self, target_type: type) -> bool: - """ - Remove the handler registered for *target_type*, if any. + """Remove the hasher registered for *target_type*, if any. Args: - target_type: The type whose handler should be removed. + target_type: The type whose hasher should be removed. Returns: - True if a handler was removed, False if none was registered. + True if a hasher was removed, False if none was registered. """ with self._lock: if target_type in self._handlers: @@ -123,59 +87,41 @@ def unregister(self, target_type: type) -> bool: return True return False - # ------------------------------------------------------------------ - # Lookup - # ------------------------------------------------------------------ - - def get_handler(self, obj: Any) -> "TypeHandlerProtocol | None": - """ - Look up the handler for *obj* using MRO-aware resolution. - - The MRO of ``type(obj)`` is walked from most-derived to least-derived - (i.e. the object's own class first, then its bases). The first - match found in the registry is returned. + def get_semantic_hasher(self, obj: Any) -> "PythonTypeSemanticHasherProtocol | None": + """Look up the hasher for *obj* using MRO-aware resolution. Args: - obj: The object for which a handler is needed. + obj: The object for which a hasher is needed. Returns: - The registered TypeHandlerProtocol, or None if no handler is registered - for the object's type or any of its base classes. + The registered ``PythonTypeSemanticHasherProtocol``, or None. """ obj_type = type(obj) with self._lock: - # Fast path: exact type match. handler = self._handlers.get(obj_type) if handler is not None: return handler - - # Slow path: walk the MRO, skipping the type itself (already - # checked above) and skipping ``object`` as a last resort -- a - # handler registered for ``object`` would match everything. for base in obj_type.__mro__[1:]: handler = self._handlers.get(base) if handler is not None: logger.debug( - "TypeHandlerRegistry: resolved handler for %s via base %s", + "PythonTypeSemanticHasherRegistry: resolved hasher for %s via base %s", obj_type.__name__, base.__name__, ) return handler - return None - def get_handler_for_type(self, target_type: type) -> "TypeHandlerProtocol | None": - """ - Look up the handler for a *type object* (rather than an instance). - - Useful when the caller already has the type and wants to check - registration without constructing a dummy instance. + def get_semantic_hasher_for_type( + self, target_type: type + ) -> "PythonTypeSemanticHasherProtocol | None": + """Look up the hasher for a *type object* (rather than an instance). Args: target_type: The type to look up. Returns: - The registered TypeHandlerProtocol, or None. + The registered ``PythonTypeSemanticHasherProtocol``, or None. """ with self._lock: handler = self._handlers.get(target_type) @@ -187,74 +133,52 @@ def get_handler_for_type(self, target_type: type) -> "TypeHandlerProtocol | None return handler return None - def has_handler(self, target_type: type) -> bool: - """ - Return True if a handler is registered for *target_type* or any of - its MRO ancestors. + def has_semantic_hasher(self, target_type: type) -> bool: + """Return True if a hasher is registered for *target_type* or any MRO ancestor. Args: target_type: The type to check. """ - return self.get_handler_for_type(target_type) is not None + return self.get_semantic_hasher_for_type(target_type) is not None def registered_types(self) -> list[type]: - """ - Return a list of all directly-registered types (no MRO expansion). - - Returns: - A snapshot list of types that have explicit handler registrations. - """ + """Return a list of all directly-registered types (no MRO expansion).""" with self._lock: return list(self._handlers.keys()) - # ------------------------------------------------------------------ - # Dunder helpers - # ------------------------------------------------------------------ - def __repr__(self) -> str: with self._lock: names = [t.__name__ for t in self._handlers] - return f"TypeHandlerRegistry(registered={names!r})" + return f"PythonTypeSemanticHasherRegistry(registered={names!r})" def __len__(self) -> int: with self._lock: return len(self._handlers) -# --------------------------------------------------------------------------- -# Pre-populated registry -# --------------------------------------------------------------------------- - - -def get_default_type_handler_registry() -> "TypeHandlerRegistry": - """ - Return the TypeHandlerRegistry from the default data context. +def get_default_python_type_semantic_hasher_registry() -> "PythonTypeSemanticHasherRegistry": + """Return the PythonTypeSemanticHasherRegistry from the default data context. This is a convenience wrapper; the registry is owned and versioned by the - active DataContext. Importing this function from + active ``DataContext``. Importing this function from ``orcapod.hashing.defaults`` or ``orcapod.hashing`` is equivalent. """ from orcapod.hashing.defaults import ( - get_default_type_handler_registry as _get, - ) # stays in hashing/ - + get_default_python_type_semantic_hasher_registry as _get, + ) return _get() -class BuiltinTypeHandlerRegistry(TypeHandlerRegistry): - """ - A TypeHandlerRegistry pre-populated with all built-in handlers. +class BuiltinPythonTypeSemanticHasherRegistry(PythonTypeSemanticHasherRegistry): + """A PythonTypeSemanticHasherRegistry pre-populated with all built-in hashers. Constructed via the data context JSON spec so that the default registry - is versioned alongside the rest of the context components. The built-in - handlers are registered in ``__init__`` so that no separate population - step is required after construction. + is versioned alongside the rest of the context components. """ def __init__(self, arrow_hasher: "ArrowHasherProtocol | None" = None) -> None: super().__init__() from orcapod.hashing.semantic_hashing.builtin_handlers import ( - register_builtin_handlers, + register_builtin_python_type_semantic_hashers, ) - - register_builtin_handlers(self, arrow_hasher=arrow_hasher) + register_builtin_python_type_semantic_hashers(self, arrow_hasher=arrow_hasher) From 1f543f4e63db3b1590f2bbd1a72fec065d439feb Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 17:59:25 +0000 Subject: [PATCH 07/33] =?UTF-8?q?refactor(builtin=5Fhandlers):=20rename=20?= =?UTF-8?q?handler=20classes,=20tighten=20hash()=20=E2=86=92=20ContentHash?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../semantic_hashing/builtin_handlers.py | 435 +++++------------- 1 file changed, 127 insertions(+), 308 deletions(-) diff --git a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py index 1b66d039..48e7dc12 100644 --- a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py +++ b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py @@ -1,30 +1,20 @@ """ -Built-in TypeHandlerProtocol implementations for the SemanticHasherProtocol system. - -This module provides handlers for all Python types that the SemanticHasherProtocol -knows how to process out of the box: - - - PathContentHandler -- pathlib.Path: returns ContentHash of file content - - UPathContentHandler -- upath.UPath: returns ContentHash of file content (remote-aware) - - UUIDHandler -- uuid.UUID: raw 16-byte binary representation - - BytesHandler -- bytes / bytearray: hex string representation - - FunctionHandler -- callable with __code__: via FunctionInfoExtractorProtocol - - TypeObjectHandler -- type objects (classes): stable "type:" string - -Note: ContentHash requires no handler -- it is recognised as a terminal by -``hash_object`` and returned as-is. - -The module also exposes ``register_builtin_handlers(registry)`` which is -called automatically when the global default registry is first accessed. - -Extending the system --------------------- -To add a handler for a third-party type, create a class that implements the -TypeHandlerProtocol protocol (a single ``handle(obj, hasher)`` method) and register -it: - - from orcapod.hashing.semantic_hashing.type_handler_registry import get_default_type_handler_registry - get_default_type_handler_registry().register(MyType, MyTypeHandler()) +Built-in PythonTypeSemanticHasherProtocol implementations. + + PathSemanticHasher -- pathlib.Path: file content hash + UPathSemanticHasher -- upath.UPath: file content hash (remote-aware) + UUIDSemanticHasher -- uuid.UUID: 16-byte binary representation + BytesSemanticHasher -- bytes/bytearray: hex string representation + FunctionSemanticHasher -- callable with __code__: via FunctionInfoExtractorProtocol + TypeObjectSemanticHasher -- type objects: stable "type:." string + SpecialFormSemanticHasher -- typing._SpecialForm + GenericAliasSemanticHasher -- generic alias type annotations + UnionTypeSemanticHasher -- types.UnionType (Python 3.10+ X | Y syntax) + ArrowTableSemanticHasher -- pa.Table / pa.RecordBatch + SchemaSemanticHasher -- Schema objects + +``register_builtin_python_type_semantic_hashers(registry)`` populates a registry +with all of the above. """ from __future__ import annotations @@ -36,442 +26,271 @@ from upath import UPath -from orcapod.types import PathLike, Schema +from orcapod.types import ContentHash, PathLike, Schema if TYPE_CHECKING: from orcapod.hashing.semantic_hashing.type_handler_registry import ( - TypeHandlerRegistry, + PythonTypeSemanticHasherRegistry, ) + from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher from orcapod.protocols.hashing_protocols import ( ArrowHasherProtocol, FileContentHasherProtocol, - SemanticHasherProtocol, ) logger = logging.getLogger(__name__) -# --------------------------------------------------------------------------- -# Individual handlers -# --------------------------------------------------------------------------- - - -class PathContentHandler: - """ - Handler for pathlib.Path objects. - - Hashes the *content* of the file at the given path using the injected - FileContentHasherProtocol, producing a stable content-addressed identifier. - The resulting bytes are stored as a hex string embedded in the resolved - structure. - - The path must refer to an existing, readable file. Directories and - missing paths are not supported and will raise an error -- if you need - a path-as-string handler, register a separate handler for that use case - or return a ``str`` from ``identity_structure()`` instead of a ``Path``. +class PathSemanticHasher: + """Hasher for pathlib.Path objects — hashes file *content*. Args: - file_hasher: Any object with a ``hash_file(path) -> ContentHash`` - method (satisfies the FileContentHasherProtocol protocol). + file_hasher: Any object with a ``hash_file(path) -> ContentHash`` method. """ - def __init__(self, file_hasher: FileContentHasherProtocol) -> None: + def __init__(self, file_hasher: "FileContentHasherProtocol") -> None: self.file_hasher = file_hasher - def handle(self, obj: PathLike, hasher: "SemanticHasherProtocol") -> Any: + def hash(self, obj: PathLike, hasher: "SemanticAwarePythonHasher") -> ContentHash: path: Path = Path(obj) - if not path.exists(): raise FileNotFoundError( - f"PathContentHandler: path does not exist: {path!r}. " - "Paths must refer to existing files for content-based hashing. " - "If you intended to hash the path string, return str(path) from " - "identity_structure() instead of a Path object." + f"PathSemanticHasher: path does not exist: {path!r}. " + "Paths must refer to existing files for content-based hashing." ) - if path.is_dir(): raise IsADirectoryError( - f"PathContentHandler: path is a directory: {path!r}. " + f"PathSemanticHasher: path is a directory: {path!r}. " "Only regular files are supported for content-based hashing." ) - - logger.debug("PathContentHandler: hashing file content at %s", path) + logger.debug("PathSemanticHasher: hashing file content at %s", path) return self.file_hasher.hash_file(path) -class UPathContentHandler: - """ - Handler for universal_pathlib.UPath objects. - - Behaves identically to ``PathContentHandler`` but preserves the UPath - instance so that remote filesystem semantics (e.g. S3, GCS) are retained - during file content hashing. +class UPathSemanticHasher: + """Hasher for universal_pathlib.UPath objects — hashes file content. Args: - file_hasher: Any object with a ``hash_file(path) -> ContentHash`` - method (satisfies the FileContentHasherProtocol protocol). + file_hasher: Any object with a ``hash_file(path) -> ContentHash`` method. """ - def __init__(self, file_hasher: FileContentHasherProtocol) -> None: + def __init__(self, file_hasher: "FileContentHasherProtocol") -> None: self.file_hasher = file_hasher - def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: if not isinstance(obj, UPath): raise TypeError( - f"UPathContentHandler: expected a UPath, got {type(obj)!r}. " - "Use PathContentHandler for pathlib.Path objects." + f"UPathSemanticHasher: expected a UPath, got {type(obj)!r}." ) - if not obj.exists(): raise FileNotFoundError( - f"UPathContentHandler: path does not exist: {obj!r}. " - "Paths must refer to existing files for content-based hashing." + f"UPathSemanticHasher: path does not exist: {obj!r}." ) - if obj.is_dir(): raise IsADirectoryError( - f"UPathContentHandler: path is a directory: {obj!r}. " - "Only regular files are supported for content-based hashing." + f"UPathSemanticHasher: path is a directory: {obj!r}." ) - - logger.debug("UPathContentHandler: hashing file content at %s", obj) + logger.debug("UPathSemanticHasher: hashing file content at %s", obj) return self.file_hasher.hash_file(obj) -class UUIDHandler: - """Handler for ``uuid.UUID`` objects. - - Returns the raw 16-byte binary representation of the UUID. - The binary form is compact, unambiguous, and independent of string - formatting conventions. UUID values in data columns are stored as - ``pa.binary(16)`` (fixed-size) within the struct type used by - ``UUIDStructConverter``; database record IDs use ``pa.large_binary()``. - """ - - def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: - return obj.bytes +class UUIDSemanticHasher: + """Hasher for ``uuid.UUID`` objects — hashes the raw 16-byte binary representation.""" + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + return hasher.hash_object(obj.bytes) -class BytesHandler: - """ - Handler for bytes and bytearray objects. - Converts binary data to its lowercase hex string representation. This - avoids JSON serialisation issues with raw bytes while preserving the - exact byte sequence in the hash input. - """ +class BytesSemanticHasher: + """Hasher for bytes and bytearray objects — hashes the lowercase hex representation.""" - def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: if isinstance(obj, (bytes, bytearray)): - return obj.hex() - raise TypeError(f"BytesHandler: expected bytes or bytearray, got {type(obj)!r}") - + return hasher.hash_object(obj.hex()) + raise TypeError( + f"BytesSemanticHasher: expected bytes or bytearray, got {type(obj)!r}" + ) -class FunctionHandler: - """ - Handler for Python functions / callables that carry a ``__code__`` attribute. - Delegates to a FunctionInfoExtractorProtocol to produce a stable, serialisable - dict representation of the function. The extractor is responsible for - deciding which parts of the function (name, signature, source body, etc.) - are included. +class FunctionSemanticHasher: + """Hasher for Python functions/callables with a ``__code__`` attribute. Args: function_info_extractor: Any object with an - ``extract_function_info(func) -> dict`` method (satisfies the - FunctionInfoExtractorProtocol protocol). + ``extract_function_info(func) -> dict`` method. """ def __init__(self, function_info_extractor: Any) -> None: self.function_info_extractor = function_info_extractor - def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: if not (callable(obj) and hasattr(obj, "__code__")): raise TypeError( - f"FunctionHandler: expected a callable with __code__, got {type(obj)!r}" + f"FunctionSemanticHasher: expected a callable with __code__, got {type(obj)!r}" ) func_name = getattr(obj, "__name__", repr(obj)) - logger.debug("FunctionHandler: extracting info for function %r", func_name) + logger.debug("FunctionSemanticHasher: extracting info for function %r", func_name) info: dict[str, Any] = self.function_info_extractor.extract_function_info(obj) - return info + return hasher.hash_object(info) -class TypeObjectHandler: - """ - Handler for type objects (i.e. classes passed as values). +class TypeObjectSemanticHasher: + """Hasher for type objects (classes passed as values). - Returns a stable string of the form ``"type:."`` so - that different classes always produce different hash inputs and the - result is human-readable. + Returns a stable string of the form ``"type:."``. """ - def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: if not isinstance(obj, type): raise TypeError( - f"TypeObjectHandler: expected a type/class, got {type(obj)!r}" + f"TypeObjectSemanticHasher: expected a type/class, got {type(obj)!r}" ) module: str = obj.__module__ or "" qualname: str = obj.__qualname__ - return f"type:{module}.{qualname}" + return hasher.hash_object(f"type:{module}.{qualname}") -class SpecialFormHandler: - """ - Handler for ``typing._SpecialForm`` objects such as ``typing.Union`` and - ``typing.ClassVar``. - - These appear as the ``__origin__`` of typing generics — for example, - ``Optional[int]`` is ``Union[int, None]``, whose ``__origin__`` is - ``typing.Union``. Returns a stable string of the form - ``"special_form:typing."`` so they can be safely embedded as the - origin component inside a ``GenericAliasHandler`` result. - """ +class SpecialFormSemanticHasher: + """Hasher for ``typing._SpecialForm`` objects such as ``typing.Union``.""" - def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: name = getattr(obj, "_name", None) or repr(obj) - return f"special_form:typing.{name}" + return hasher.hash_object(f"special_form:typing.{name}") -class GenericAliasHandler: - """ - Handler for generic alias type annotations such as ``dict[int, list[int]]`` - (``types.GenericAlias``) and ``typing`` generics (``typing._GenericAlias``). - - Produces a stable dict containing the origin type and a list of hashed - argument types so that structurally identical generic annotations always - yield the same hash, and structurally different ones yield different hashes. +class GenericAliasSemanticHasher: + """Hasher for generic alias type annotations (``dict[int, str]``, ``Optional[X]``, etc.).""" - When the origin is ``typing.Union`` (i.e. ``typing.Optional[X]`` or - ``typing.Union[X, Y]``), the handler produces a canonical ``"union"`` - form with sorted args — identical to `UnionTypeHandler` — so that - ``typing.Optional[int]`` and ``int | None`` hash equivalently. - """ - - def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: import typing origin = getattr(obj, "__origin__", None) args = getattr(obj, "__args__", None) or () if origin is None: - return f"generic_alias:{obj!r}" - - # Normalize typing.Union / typing.Optional to the canonical union - # form so that typing.Optional[int] ≡ typing.Union[int, None] ≡ int | None. + return hasher.hash_object(f"generic_alias:{obj!r}") if origin is typing.Union: hashed_args = sorted(hasher.hash_object(arg).to_string() for arg in args) - return { - "__type__": "union", - "args": hashed_args, - } - - return { + return hasher.hash_object({"__type__": "union", "args": hashed_args}) + return hasher.hash_object({ "__type__": "generic_alias", "origin": hasher.hash_object(origin).to_string(), "args": [hasher.hash_object(arg).to_string() for arg in args], - } + }) -class UnionTypeHandler: - """ - Handler for ``types.UnionType`` objects (Python 3.10+ ``X | Y`` syntax). - - ``str | None``, ``int | float``, etc. produce a ``types.UnionType`` at - runtime, which is distinct from ``typing.Union[str, None]`` - (a ``typing._GenericAlias``). This handler normalises union types into - a canonical ``"union"`` form with sorted args — identical to the union - branch in `GenericAliasHandler` — so that ``int | None``, - ``typing.Optional[int]``, and ``typing.Union[int, None]`` all hash - equivalently. - """ +class UnionTypeSemanticHasher: + """Hasher for ``types.UnionType`` objects (Python 3.10+ ``X | Y`` syntax).""" - def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: args = getattr(obj, "__args__", None) or () hashed_args = sorted(hasher.hash_object(arg).to_string() for arg in args) - return { - "__type__": "union", - "args": hashed_args, - } + return hasher.hash_object({"__type__": "union", "args": hashed_args}) -class ArrowTableHandler: - """ - Handler for ``pa.Table`` and ``pa.RecordBatch`` objects. - - Delegates to the injected ``ArrowHasherProtocol`` to produce a stable, - content-addressed ``ContentHash`` of the Arrow table data. The returned - ``ContentHash`` is recognised as a terminal by ``hash_object`` and - returned as-is — no further recursion occurs. +class ArrowTableSemanticHasher: + """Hasher for ``pa.Table`` and ``pa.RecordBatch`` objects. Args: - arrow_hasher: Any object satisfying ArrowHasherProtocol (i.e. has a - ``hash_table(table) -> ContentHash`` method). + arrow_hasher: Any object satisfying ``ArrowHasherProtocol``. """ - def __init__(self, arrow_hasher: ArrowHasherProtocol) -> None: + def __init__(self, arrow_hasher: "ArrowHasherProtocol") -> None: self.arrow_hasher = arrow_hasher - def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: import pyarrow as _pa if isinstance(obj, _pa.RecordBatch): obj = _pa.Table.from_batches([obj]) if not isinstance(obj, _pa.Table): raise TypeError( - f"ArrowTableHandler: expected pa.Table or pa.RecordBatch, got {type(obj)!r}" + f"ArrowTableSemanticHasher: expected pa.Table or pa.RecordBatch, got {type(obj)!r}" ) return self.arrow_hasher.hash_table(obj) -class SchemaHandler: - """ - Handler for `Schema` objects. - - Produces a stable dict containing both the field-type mapping and the - sorted list of optional field names, so that two schemas differing only - in which fields are optional produce different hashes. - """ +class SchemaSemanticHasher: + """Hasher for ``Schema`` objects.""" - def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: if not isinstance(obj, Schema): - raise TypeError(f"SchemaHandler: expected a Schema, got {type(obj)!r}") - # schema handler is not implemented yet - raise NotImplementedError() - # visited: frozenset[int] = frozenset() - - # return { - # "fields": {k: hasher._expand_element(v, visited) for k, v in obj.items()}, - # "optional_fields": sorted(obj.optional_fields), - # } - - -# --------------------------------------------------------------------------- -# Registration helper -# --------------------------------------------------------------------------- + raise TypeError( + f"SchemaSemanticHasher: expected a Schema, got {type(obj)!r}" + ) + raise NotImplementedError("SchemaSemanticHasher is not yet implemented.") -def register_builtin_handlers( - registry: "TypeHandlerRegistry", +def register_builtin_python_type_semantic_hashers( + registry: "PythonTypeSemanticHasherRegistry", file_hasher: Any = None, function_info_extractor: Any = None, arrow_hasher: "ArrowHasherProtocol | None" = None, ) -> None: - """ - Register all built-in TypeHandlers into *registry*. - - This function is called automatically when the global default registry is - first accessed via ``get_default_type_handler_registry()``. It can also - be called manually to populate a custom registry. - - Path, function, and Arrow table handling require auxiliary objects. - When these are not supplied, sensible defaults are constructed: + """Register all built-in semantic hashers into *registry*. - - ``BasicFileHasher`` (SHA-256, 64 KiB buffer) for Path handling. - - ``FunctionSignatureExtractor`` for function handling. - - ``SemanticArrowHasher`` (SHA-256, logical serialisation) for Arrow table handling. + When ``arrow_hasher`` is None, ``pa.Table`` and ``pa.RecordBatch`` handlers + are **not** registered (to avoid circular dependency in the JSON context + construction — the default context's ``python_type_semantic_hasher_registry`` + is built before ``arrow_hasher``). Args: - registry: - The TypeHandlerRegistry to populate. - file_hasher: - Optional object satisfying FileContentHasherProtocol (i.e. has a - ``hash_file(path) -> ContentHash`` method). Defaults to a - ``BasicFileHasher`` configured with SHA-256. - function_info_extractor: - Optional object satisfying FunctionInfoExtractorProtocol (i.e. has an - ``extract_function_info(func) -> dict`` method). Defaults to - ``FunctionSignatureExtractor``. - arrow_hasher: - Optional object satisfying ArrowHasherProtocol (i.e. has a - ``hash_table(table) -> ContentHash`` method). Defaults to a - ``SemanticArrowHasher`` configured with SHA-256 and logical serialisation. - Should be the data context's arrow hasher when called from a versioned - context so that hashing is consistent across all components. + registry: The ``PythonTypeSemanticHasherRegistry`` to populate. + file_hasher: Optional ``FileContentHasherProtocol`` for path hashing. + Defaults to ``BasicFileHasher(sha256)``. + function_info_extractor: Optional ``FunctionInfoExtractorProtocol``. + Defaults to ``FunctionSignatureExtractor``. + arrow_hasher: Optional ``ArrowHasherProtocol`` for nested table hashing. + When None, Arrow table handlers are skipped. """ - # Resolve defaults for auxiliary objects ---------------------------- if file_hasher is None: - from orcapod.hashing.file_hashers import BasicFileHasher # stays in hashing/ - + from orcapod.hashing.file_hashers import BasicFileHasher file_hasher = BasicFileHasher(algorithm="sha256") if function_info_extractor is None: from orcapod.hashing.semantic_hashing.function_info_extractors import ( FunctionSignatureExtractor, ) - function_info_extractor = FunctionSignatureExtractor( include_module=True, include_defaults=True, ) - if arrow_hasher is None: - from orcapod.hashing.arrow_hashers import SemanticArrowHasher - from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry - - arrow_hasher = SemanticArrowHasher( - semantic_registry=SemanticTypeRegistry(), - hasher_id="arrow_v0.1", - hash_algorithm="sha256", - serialization_method="logical", - ) - - # Register handlers ------------------------------------------------- - - # bytes / bytearray - bytes_handler = BytesHandler() - registry.register(bytes, bytes_handler) - registry.register(bytearray, bytes_handler) - - # pathlib.Path (and subclasses such as PosixPath / WindowsPath) - registry.register(Path, PathContentHandler(file_hasher)) - - # uuid.UUID - registry.register(UUID, UUIDHandler()) + bytes_hasher = BytesSemanticHasher() + registry.register(bytes, bytes_hasher) + registry.register(bytearray, bytes_hasher) - # Note: ContentHash needs no handler -- SemanticHasherProtocol treats it as - # a terminal in hash_object() and returns it as-is. + registry.register(Path, PathSemanticHasher(file_hasher)) + registry.register(UPath, UPathSemanticHasher(file_hasher)) + registry.register(UUID, UUIDSemanticHasher()) - # Functions -- register types.FunctionType so MRO lookup works for - # plain ``def`` functions, plus built-in functions and bound methods. import types as _types - function_handler = FunctionHandler(function_info_extractor) - registry.register(_types.FunctionType, function_handler) - registry.register(_types.BuiltinFunctionType, function_handler) - registry.register(_types.MethodType, function_handler) + function_hasher = FunctionSemanticHasher(function_info_extractor) + registry.register(_types.FunctionType, function_hasher) + registry.register(_types.BuiltinFunctionType, function_hasher) + registry.register(_types.MethodType, function_hasher) - # type objects (classes used as values, e.g. passed in a dict) - registry.register(type, TypeObjectHandler()) + registry.register(type, TypeObjectSemanticHasher()) + registry.register(_types.UnionType, UnionTypeSemanticHasher()) - # types.UnionType (Python 3.10+ X | Y syntax, e.g. str | None) - registry.register(_types.UnionType, UnionTypeHandler()) - - # generic alias type annotations: dict[int, str], list[str], etc. - generic_alias_handler = GenericAliasHandler() - registry.register(_types.GenericAlias, generic_alias_handler) - # typing._GenericAlias covers Optional[X], Union[X, Y], Dict[K, V], etc. - # typing._SpecialForm covers typing.Union, typing.ClassVar, etc. which - # appear as __origin__ on those generics (e.g. Optional[int].__origin__ - # is typing.Union, a _SpecialForm). + generic_alias_hasher = GenericAliasSemanticHasher() + registry.register(_types.GenericAlias, generic_alias_hasher) try: import typing as _typing - - registry.register(_typing._GenericAlias, generic_alias_handler) # type: ignore[attr-defined] - registry.register(_typing._SpecialForm, SpecialFormHandler()) # type: ignore[attr-defined] + registry.register(_typing._GenericAlias, generic_alias_hasher) # type: ignore[attr-defined] + registry.register(_typing._SpecialForm, SpecialFormSemanticHasher()) # type: ignore[attr-defined] except AttributeError: pass - # Schema objects -- must come after type handler so Schema is matched - # specifically rather than falling through to the Mapping expansion path - registry.register(Schema, SchemaHandler()) + registry.register(Schema, SchemaSemanticHasher()) - # Arrow tables and record batches -- delegate to the injected arrow hasher - import pyarrow as _pa - - arrow_table_handler = ArrowTableHandler(arrow_hasher) - registry.register(_pa.Table, arrow_table_handler) - registry.register(_pa.RecordBatch, arrow_table_handler) + if arrow_hasher is not None: + import pyarrow as _pa + arrow_table_hasher = ArrowTableSemanticHasher(arrow_hasher) + registry.register(_pa.Table, arrow_table_hasher) + registry.register(_pa.RecordBatch, arrow_table_hasher) logger.debug( - "register_builtin_handlers: registered %d built-in handlers", + "register_builtin_python_type_semantic_hashers: registered %d hashers", len(registry), ) From 895f885580aede217464706c239e9b965949a6a7 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 18:08:50 +0000 Subject: [PATCH 08/33] =?UTF-8?q?refactor(semantic=5Fhasher):=20rename=20B?= =?UTF-8?q?aseSemanticHasher=20=E2=86=92=20SemanticAwarePythonHasher,=20si?= =?UTF-8?q?mplify=20dispatch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also updates defaults.py: replaces get_default_type_handler_registry with get_default_python_type_semantic_hasher_registry to match the new registry API. --- src/orcapod/hashing/defaults.py | 14 +++-- .../semantic_hashing/semantic_hasher.py | 57 ++++++++++--------- 2 files changed, 37 insertions(+), 34 deletions(-) diff --git a/src/orcapod/hashing/defaults.py b/src/orcapod/hashing/defaults.py index d00e0e3a..0082c453 100644 --- a/src/orcapod/hashing/defaults.py +++ b/src/orcapod/hashing/defaults.py @@ -10,22 +10,24 @@ # from its JSON spec. Constructing them here would bypass versioning and # produce hashers that are decoupled from the active data context. -from orcapod.hashing.semantic_hashing.type_handler_registry import TypeHandlerRegistry +from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeSemanticHasherRegistry from orcapod.protocols import hashing_protocols as hp -def get_default_type_handler_registry() -> TypeHandlerRegistry: +def get_default_python_type_semantic_hasher_registry() -> PythonTypeSemanticHasherRegistry: """ - Return the TypeHandlerRegistry from the default data context's semantic hasher. + Return the ``PythonTypeSemanticHasherRegistry`` from the default data context's + semantic hasher. - The registry is owned by the active ``BaseSemanticHasher``, which is itself + The registry is owned by the active ``SemanticAwarePythonHasher``, which is itself versioned inside the active ``DataContext``. Returns: - TypeHandlerRegistry: The type handler registry from the default data context. + PythonTypeSemanticHasherRegistry: The type semantic hasher registry from the + default data context. """ from orcapod.contexts import get_default_context - return get_default_context().semantic_hasher.type_handler_registry + return get_default_context().semantic_hasher.type_semantic_hasher_registry def get_default_semantic_hasher() -> hp.SemanticHasherProtocol: diff --git a/src/orcapod/hashing/semantic_hashing/semantic_hasher.py b/src/orcapod/hashing/semantic_hashing/semantic_hasher.py index 79714fb8..bcc18b51 100644 --- a/src/orcapod/hashing/semantic_hashing/semantic_hasher.py +++ b/src/orcapod/hashing/semantic_hashing/semantic_hasher.py @@ -1,5 +1,5 @@ """ -BaseSemanticHasher -- content-based recursive object hasher. +SemanticAwarePythonHasher -- content-based recursive object hasher. Algorithm --------- @@ -13,7 +13,7 @@ - Primitive → JSON-serialise + SHA-256 - Structure → delegate to ``_expand_structure``, then JSON-serialise the resulting tagged tree + SHA-256 - - Handler match → call handler.handle(obj), recurse via hash_object + - Semantic hasher match → semantic_hasher.hash(obj, self) returns ContentHash directly - ContentIdentifiableProtocol→ call identity_structure(), recurse via hash_object - Fallback → strict error or best-effort string, then hash @@ -69,7 +69,7 @@ from collections.abc import Callable, Mapping from typing import Any -from orcapod.hashing.semantic_hashing.type_handler_registry import TypeHandlerRegistry +from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeSemanticHasherRegistry from orcapod.protocols import hashing_protocols as hp from orcapod.types import ContentHash @@ -79,7 +79,7 @@ _MEMADDR_RE = re.compile(r" at 0x[0-9a-fA-F]+") -class BaseSemanticHasher: +class SemanticAwarePythonHasher: """ Content-based recursive hasher. @@ -88,9 +88,10 @@ class BaseSemanticHasher: hasher_id: A short string identifying this hasher version/configuration. Embedded in every ContentHash produced. - type_handler_registry: - TypeHandlerRegistry for MRO-aware lookup of TypeHandlerProtocol instances. - If None, the default registry from the active DataContext is used. + type_semantic_hasher_registry: + ``PythonTypeSemanticHasherRegistry`` for MRO-aware lookup of + ``PythonTypeSemanticHasherProtocol`` instances. + If None, the default registry is used. strict: When True (default) raises TypeError for unhandled types. When False falls back to a best-effort string representation. @@ -99,18 +100,17 @@ class BaseSemanticHasher: def __init__( self, hasher_id: str, - type_handler_registry: TypeHandlerRegistry | None = None, + type_semantic_hasher_registry: PythonTypeSemanticHasherRegistry | None = None, strict: bool = True, ) -> None: self._hasher_id = hasher_id self._strict = strict - if type_handler_registry is None: - from orcapod.hashing.defaults import get_default_type_handler_registry - - self._registry = get_default_type_handler_registry() # stays in hashing/ + if type_semantic_hasher_registry is None: + from orcapod.hashing.defaults import get_default_python_type_semantic_hasher_registry + self._registry = get_default_python_type_semantic_hasher_registry() else: - self._registry = type_handler_registry + self._registry = type_semantic_hasher_registry # ------------------------------------------------------------------ # Public API @@ -125,8 +125,8 @@ def strict(self) -> bool: return self._strict @property - def type_handler_registry(self) -> TypeHandlerRegistry: - """Return the ``TypeHandlerRegistry`` used by this hasher.""" + def type_semantic_hasher_registry(self) -> PythonTypeSemanticHasherRegistry: + """Return the ``PythonTypeSemanticHasherRegistry`` used by this hasher.""" return self._registry def hash_object( @@ -143,7 +143,7 @@ def hash_object( - ContentHash → terminal; returned as-is - Primitive → JSON-serialised and hashed directly - Structure → structurally expanded then hashed - - Handler match → handler produces a value, recurse + - Semantic hasher match → semantic_hasher.hash(obj, self) returns ContentHash directly - ContentIdentifiableProtocol→ resolver(obj) if resolver provided, else obj.content_hash() - Unknown type → TypeError in strict mode; best-effort otherwise @@ -174,15 +174,15 @@ def hash_object( ) return self._hash_to_content_hash(expanded) - # Handler dispatch: the handler produces a new value; recurse. - handler = self._registry.get_handler(obj) - if handler is not None: + # Semantic hasher dispatch: the hasher produces a ContentHash directly. + semantic_hasher = self._registry.get_semantic_hasher(obj) + if semantic_hasher is not None: logger.debug( - "hash_object: dispatching %s to handler %s", + "hash_object: dispatching %s to semantic hasher %s", type(obj).__name__, - type(handler).__name__, + type(semantic_hasher).__name__, ) - return self.hash_object(handler.handle(obj, self), resolver=resolver) + return semantic_hasher.hash(obj, self) # ContentIdentifiableProtocol: use resolver if provided, else content_hash(). if isinstance(obj, hp.ContentIdentifiableProtocol): @@ -359,9 +359,9 @@ def _hash_to_content_hash(self, obj: Any) -> ContentHash: ).encode("utf-8") except (TypeError, ValueError) as exc: raise TypeError( - f"BaseSemanticHasher: failed to JSON-serialise object of type " - f"{type(obj).__name__!r}. Ensure all TypeHandlers and " - "identity_structure() implementations return JSON-serialisable " + f"SemanticAwarePythonHasher: failed to JSON-serialise object of type " + f"{type(obj).__name__!r}. Ensure all PythonTypeSemanticHasherProtocol " + "implementations and identity_structure() return JSON-serialisable " "primitives or structures." ) from exc @@ -383,9 +383,10 @@ def _handle_unknown(self, obj: Any) -> str: if self._strict: raise TypeError( - f"BaseSemanticHasher (strict): no TypeHandlerProtocol registered for type " - f"'{qualified}' and it does not implement ContentIdentifiableProtocol. " - "Register a TypeHandlerProtocol via the TypeHandlerRegistry or implement " + f"SemanticAwarePythonHasher (strict): no PythonTypeSemanticHasherProtocol " + f"registered for type '{qualified}' and it does not implement " + "ContentIdentifiableProtocol. Register a PythonTypeSemanticHasherProtocol " + "via the PythonTypeSemanticHasherRegistry or implement " "identity_structure() on the class." ) From b965fc54ad704e4e6d8888a23441dc91faa936b9 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 18:14:09 +0000 Subject: [PATCH 09/33] =?UTF-8?q?refactor:=20update=20BaseSemanticHasher?= =?UTF-8?q?=20=E2=86=92=20SemanticAwarePythonHasher=20refs=20in=20mixin=20?= =?UTF-8?q?and=20core?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/orcapod/contexts/core.py | 4 +-- .../content_identifiable_mixin.py | 25 +++++++++---------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/orcapod/contexts/core.py b/src/orcapod/contexts/core.py index c02dc985..6b4aa2ca 100644 --- a/src/orcapod/contexts/core.py +++ b/src/orcapod/contexts/core.py @@ -21,8 +21,8 @@ class DataContext: registration. This is the single public API for all type operations. arrow_hasher: Arrow table hasher for this context semantic_hasher: General semantic hasher for this context. The - ``TypeHandlerRegistry`` used for hashing is accessible via - ``semantic_hasher.type_handler_registry``. + ``PythonTypeSemanticHasherRegistry`` used for hashing is accessible via + ``semantic_hasher.type_semantic_hasher_registry``. """ context_key: str diff --git a/src/orcapod/hashing/semantic_hashing/content_identifiable_mixin.py b/src/orcapod/hashing/semantic_hashing/content_identifiable_mixin.py index f4bd04ce..effa94ad 100644 --- a/src/orcapod/hashing/semantic_hashing/content_identifiable_mixin.py +++ b/src/orcapod/hashing/semantic_hashing/content_identifiable_mixin.py @@ -3,14 +3,14 @@ Any class that implements ``identity_structure()`` can inherit from this mixin to gain a full suite of content-based identity helpers without having to wire -up a BaseSemanticHasher manually: +up a ``SemanticAwarePythonHasher`` manually: - ``content_hash()`` -- returns a stable ContentHash for the object - ``__hash__()`` -- Python hash based on content (int) - ``__eq__()`` -- equality via content_hash comparison -The mixin uses the global default BaseSemanticHasher by default, but accepts an -injected hasher for testing or custom configurations. +The mixin uses the global default ``SemanticAwarePythonHasher`` by default, but +accepts an injected hasher for testing or custom configurations. Usage ----- @@ -32,7 +32,7 @@ def identity_structure(self): With an injected hasher (e.g. in tests):: - hasher = BaseSemanticHasher(hasher_id="test", strict=True) + hasher = SemanticAwarePythonHasher(hasher_id="test", strict=True) record = MyRecord("foo", 42) record._semantic_hasher = hasher print(record.content_hash()) @@ -65,7 +65,7 @@ def identity_structure(self): import logging from typing import Any -from orcapod.hashing.semantic_hashing.semantic_hasher import BaseSemanticHasher +from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher from orcapod.types import ContentHash logger = logging.getLogger(__name__) @@ -82,19 +82,19 @@ def identity_structure(self) -> Any: ... The returned structure is recursively resolved and hashed by the - BaseSemanticHasher to produce a stable ContentHash. + ``SemanticAwarePythonHasher`` to produce a stable ContentHash. Parameters (passed as keyword arguments to ``__init__``) --------------------------------------------------------- semantic_hasher: - Optional BaseSemanticHasher instance to use. When omitted, the hasher - is obtained from the default data context via + Optional ``SemanticAwarePythonHasher`` instance to use. When omitted, + the hasher is obtained from the default data context via ``orcapod.contexts.get_default_context().semantic_hasher``, which is the single source of truth for versioned component configuration. """ def __init__( - self, *, semantic_hasher: BaseSemanticHasher | None = None, **kwargs: Any + self, *, semantic_hasher: SemanticAwarePythonHasher | None = None, **kwargs: Any ) -> None: # Cooperative MRO-friendly init -- forward remaining kwargs up the chain. super().__init__(**kwargs) @@ -215,9 +215,8 @@ def _invalidate_content_hash_cache(self) -> None: # Hasher resolution # ------------------------------------------------------------------ - def _get_hasher(self) -> BaseSemanticHasher: - """ - Return the BaseSemanticHasher to use for this object. + def _get_hasher(self) -> SemanticAwarePythonHasher: + """Return the ``SemanticAwarePythonHasher`` to use for this object. Resolution order: 1. The instance-level ``_semantic_hasher`` attribute (set at @@ -230,7 +229,7 @@ def _get_hasher(self) -> BaseSemanticHasher: type converter, etc.) that belong to the same context. Returns: - BaseSemanticHasher: The hasher to use. + SemanticAwarePythonHasher: The hasher to use. """ if self._semantic_hasher is not None: return self._semantic_hasher From 0b8ae996a7d9537b61147b098ff1dc8c99f3e8fb Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 18:15:01 +0000 Subject: [PATCH 10/33] refactor(hashing): update __init__.py exports and versioned_hashers for rename --- src/orcapod/hashing/__init__.py | 125 +++++++----------- .../hashing/semantic_hashing/__init__.py | 67 +++++----- src/orcapod/hashing/versioned_hashers.py | 42 ++---- 3 files changed, 90 insertions(+), 144 deletions(-) diff --git a/src/orcapod/hashing/__init__.py b/src/orcapod/hashing/__init__.py index 8055509b..ceb0b059 100644 --- a/src/orcapod/hashing/__init__.py +++ b/src/orcapod/hashing/__init__.py @@ -3,23 +3,20 @@ Public API ---------- - BaseSemanticHasher -- content-based recursive object hasher (concrete) - SemanticHasherProtocol -- protocol for semantic hashers - TypeHandlerRegistry -- registry mapping types to TypeHandlerProtocol instances - get_default_semantic_hasher -- global default SemanticHasherProtocol factory - get_default_type_handler_registry -- global default TypeHandlerRegistry factory - ContentIdentifiableMixin -- convenience mixin for content-identifiable objects + SemanticAwarePythonHasher -- content-based recursive object hasher + SemanticHasherProtocol -- protocol for semantic hashers + PythonTypeSemanticHasherRegistry -- registry mapping types to PythonTypeSemanticHasherProtocol instances + get_default_semantic_hasher -- global default SemanticHasherProtocol factory + get_default_python_type_semantic_hasher_registry -- global default registry factory + ContentIdentifiableMixin -- convenience mixin for content-identifiable objects -Built-in handlers (importable for custom registry setup): - PathContentHandler - UUIDHandler - BytesHandler - FunctionHandler - TypeObjectHandler - register_builtin_handlers - -Legacy names (kept for backward compatibility): - HashableMixin -- legacy mixin from legacy_core (deprecated) +Built-in hashers (importable for custom registry setup): + PathSemanticHasher + UUIDSemanticHasher + BytesSemanticHasher + FunctionSemanticHasher + TypeObjectSemanticHasher + register_builtin_python_type_semantic_hashers Utility: FileContentHasherProtocol @@ -28,41 +25,40 @@ ArrowHasherProtocol """ -# --------------------------------------------------------------------------- -# New API -- SemanticHasherProtocol, registry, mixin -# --------------------------------------------------------------------------- - -# --------------------------------------------------------------------------- -# Default hasher factories -# --------------------------------------------------------------------------- from orcapod.hashing.defaults import ( get_default_arrow_hasher, + get_default_python_type_semantic_hasher_registry, get_default_semantic_hasher, - get_default_type_handler_registry, ) - -# --------------------------------------------------------------------------- -# File hashing utilities -# --------------------------------------------------------------------------- from orcapod.hashing.file_hashers import BasicFileHasher, CachedFileHasher from orcapod.hashing.hash_utils import hash_file from orcapod.hashing.semantic_hashing.builtin_handlers import ( - BytesHandler, - FunctionHandler, - PathContentHandler, - TypeObjectHandler, - UUIDHandler, - register_builtin_handlers, + BytesSemanticHasher, + FunctionSemanticHasher, + PathSemanticHasher, + TypeObjectSemanticHasher, + UUIDSemanticHasher, + register_builtin_python_type_semantic_hashers, ) from orcapod.hashing.semantic_hashing.content_identifiable_mixin import ( ContentIdentifiableMixin, ) +from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher +from orcapod.hashing.semantic_hashing.type_handler_registry import ( + BuiltinPythonTypeSemanticHasherRegistry, + PythonTypeSemanticHasherRegistry, +) +from orcapod.protocols.hashing_protocols import ( + ArrowHasherProtocol, + ContentIdentifiableProtocol, + FileContentHasherProtocol, + FunctionInfoExtractorProtocol, + PythonTypeSemanticHasherProtocol, + SemanticHasherProtocol, + SemanticTypeHasherProtocol, + StringCacherProtocol, +) -# --------------------------------------------------------------------------- -# Legacy API (deprecated -- kept for backward compatibility) -# These imports are guarded because legacy_core.py has pre-existing import -# issues (e.g. references to removed types) that should not block the new API. -# --------------------------------------------------------------------------- try: from orcapod.hashing.legacy_core import ( HashableMixin, @@ -85,60 +81,31 @@ hash_to_hex = None # type: ignore[assignment] hash_to_int = None # type: ignore[assignment] hash_to_uuid = None # type: ignore[assignment] -from orcapod.hashing.semantic_hashing.semantic_hasher import BaseSemanticHasher -from orcapod.hashing.semantic_hashing.type_handler_registry import ( - BuiltinTypeHandlerRegistry, - TypeHandlerRegistry, -) - -# --------------------------------------------------------------------------- -# Protocols (re-exported for convenience) -# --------------------------------------------------------------------------- -from orcapod.protocols.hashing_protocols import ( - ArrowHasherProtocol, - ContentIdentifiableProtocol, - FileContentHasherProtocol, - FunctionInfoExtractorProtocol, - SemanticHasherProtocol, - SemanticTypeHasherProtocol, - StringCacherProtocol, - TypeHandlerProtocol, -) - -# --------------------------------------------------------------------------- -# __all__ -- defines the public surface of this package -# --------------------------------------------------------------------------- __all__ = [ - # ---- New API: concrete implementation ---- - "BaseSemanticHasher", - "TypeHandlerRegistry", - "BuiltinTypeHandlerRegistry", - "get_default_type_handler_registry", + "SemanticAwarePythonHasher", + "PythonTypeSemanticHasherRegistry", + "BuiltinPythonTypeSemanticHasherRegistry", + "get_default_python_type_semantic_hasher_registry", "get_default_semantic_hasher", "ContentIdentifiableMixin", - # Built-in handlers - "PathContentHandler", - "UUIDHandler", - "BytesHandler", - "FunctionHandler", - "TypeObjectHandler", - "register_builtin_handlers", - # ---- Protocols ---- + "PathSemanticHasher", + "UUIDSemanticHasher", + "BytesSemanticHasher", + "FunctionSemanticHasher", + "TypeObjectSemanticHasher", + "register_builtin_python_type_semantic_hashers", "SemanticHasherProtocol", "ContentIdentifiableProtocol", - "TypeHandlerProtocol", + "PythonTypeSemanticHasherProtocol", "FileContentHasherProtocol", "ArrowHasherProtocol", "StringCacherProtocol", "FunctionInfoExtractorProtocol", "SemanticTypeHasherProtocol", - # ---- File hashing ---- "BasicFileHasher", "CachedFileHasher", "hash_file", - # ---- Legacy / backward-compatible ---- - # TODO: remove legacy section "get_default_arrow_hasher", "HashableMixin", "hash_to_hex", diff --git a/src/orcapod/hashing/semantic_hashing/__init__.py b/src/orcapod/hashing/semantic_hashing/__init__.py index bc120c18..db0eb765 100644 --- a/src/orcapod/hashing/semantic_hashing/__init__.py +++ b/src/orcapod/hashing/semantic_hashing/__init__.py @@ -1,34 +1,32 @@ """ orcapod.hashing.semantic_hashing ================================= -Sub-package containing all components of the semantic hashing system: + SemanticAwarePythonHasher -- content-based recursive object hasher + PythonTypeSemanticHasherRegistry -- MRO-aware registry mapping types → PythonTypeSemanticHasherProtocol + BuiltinPythonTypeSemanticHasherRegistry -- pre-populated registry with built-in hashers + ContentIdentifiableMixin -- convenience mixin for content-identifiable objects - BaseSemanticHasher -- content-based recursive object hasher - TypeHandlerRegistry -- MRO-aware registry mapping types → TypeHandlerProtocol - BuiltinTypeHandlerRegistry -- pre-populated registry with built-in handlers - ContentIdentifiableMixin -- convenience mixin for content-identifiable objects +Built-in PythonTypeSemanticHasherProtocol implementations: + PathSemanticHasher -- pathlib.Path → file-content hash + UUIDSemanticHasher -- uuid.UUID → canonical bytes + BytesSemanticHasher -- bytes/bytearray → hex string + FunctionSemanticHasher -- callable → via FunctionInfoExtractorProtocol + TypeObjectSemanticHasher -- type objects → "type:." + register_builtin_python_type_semantic_hashers -- populate a registry with all of the above -Built-in TypeHandlerProtocol implementations: - PathContentHandler -- pathlib.Path → file-content hash - UUIDHandler -- uuid.UUID → canonical string - BytesHandler -- bytes/bytearray → hex string - FunctionHandler -- callable → via FunctionInfoExtractorProtocol - TypeObjectHandler -- type objects → "type:." - register_builtin_handlers -- populate a registry with all of the above - -Function info extractors (used by FunctionHandler): +Function info extractors (used by FunctionSemanticHasher): FunctionNameExtractor FunctionSignatureExtractor FunctionInfoExtractorFactory """ from orcapod.hashing.semantic_hashing.builtin_handlers import ( - BytesHandler, - FunctionHandler, - PathContentHandler, - TypeObjectHandler, - UUIDHandler, - register_builtin_handlers, + BytesSemanticHasher, + FunctionSemanticHasher, + PathSemanticHasher, + TypeObjectSemanticHasher, + UUIDSemanticHasher, + register_builtin_python_type_semantic_hashers, ) from orcapod.hashing.semantic_hashing.content_identifiable_mixin import ( ContentIdentifiableMixin, @@ -38,28 +36,23 @@ FunctionNameExtractor, FunctionSignatureExtractor, ) -from orcapod.hashing.semantic_hashing.semantic_hasher import BaseSemanticHasher +from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher from orcapod.hashing.semantic_hashing.type_handler_registry import ( - BuiltinTypeHandlerRegistry, - TypeHandlerRegistry, + BuiltinPythonTypeSemanticHasherRegistry, + PythonTypeSemanticHasherRegistry, ) __all__ = [ - # Core hasher - "BaseSemanticHasher", - # Registry - "TypeHandlerRegistry", - "BuiltinTypeHandlerRegistry", - # Mixin + "SemanticAwarePythonHasher", + "PythonTypeSemanticHasherRegistry", + "BuiltinPythonTypeSemanticHasherRegistry", "ContentIdentifiableMixin", - # Built-in handlers - "PathContentHandler", - "UUIDHandler", - "BytesHandler", - "FunctionHandler", - "TypeObjectHandler", - "register_builtin_handlers", - # Function info extractors + "PathSemanticHasher", + "UUIDSemanticHasher", + "BytesSemanticHasher", + "FunctionSemanticHasher", + "TypeObjectSemanticHasher", + "register_builtin_python_type_semantic_hashers", "FunctionNameExtractor", "FunctionSignatureExtractor", "FunctionInfoExtractorFactory", diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index 1e7b7255..b12bd2d3 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -49,52 +49,38 @@ def get_versioned_semantic_hasher( hasher_id: str = _CURRENT_SEMANTIC_HASHER_ID, strict: bool = True, - type_handler_registry: "hp.TypeHandlerRegistry | None" = None, # type: ignore[name-defined] + type_semantic_hasher_registry: "Any | None" = None, ) -> hp.SemanticHasherProtocol: - """ - Return a SemanticHasherProtocol configured for the current version. - - The returned hasher uses the global default TypeHandlerRegistry (which - is pre-populated with all built-in handlers) unless an explicit registry - is supplied. + """Return a SemanticAwarePythonHasher configured for the current version. Parameters ---------- hasher_id: Identifier embedded in every ContentHash produced by this hasher. - Defaults to the current version constant. Override only when - producing hashes that must be tagged with a specific version string. strict: - When True (the default) the hasher raises TypeError on encountering - an object of an unhandled type. When False it falls back to a - best-effort string representation with a logged warning. - type_handler_registry: - Optional TypeHandlerRegistry to inject. When None the global - default registry is used (recommended for production code). - - Returns - ------- - SemanticHasherProtocol - A fully configured SemanticHasherProtocol instance. + When True raises TypeError for unhandled types. When False falls back + to a best-effort string representation. + type_semantic_hasher_registry: + Optional ``PythonTypeSemanticHasherRegistry`` to inject. When None the + global default registry is used. """ - from orcapod.hashing.semantic_hashing.semantic_hasher import BaseSemanticHasher + from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher - if type_handler_registry is None: + if type_semantic_hasher_registry is None: from orcapod.hashing.semantic_hashing.type_handler_registry import ( - get_default_type_handler_registry, + get_default_python_type_semantic_hasher_registry, ) - - type_handler_registry = get_default_type_handler_registry() + type_semantic_hasher_registry = get_default_python_type_semantic_hasher_registry() logger.debug( - "get_versioned_semantic_hasher: creating BaseSemanticHasher " + "get_versioned_semantic_hasher: creating SemanticAwarePythonHasher " "(hasher_id=%r, strict=%r)", hasher_id, strict, ) - return BaseSemanticHasher( + return SemanticAwarePythonHasher( hasher_id=hasher_id, - type_handler_registry=type_handler_registry, + type_semantic_hasher_registry=type_semantic_hasher_registry, strict=strict, ) From 21cedfc8cf5077eea2f3fed8c44be31669386719 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 18:16:25 +0000 Subject: [PATCH 11/33] refactor(contexts): update v0.1.json context spec to use renamed class names --- src/orcapod/contexts/data/v0.1.json | 36 ++++++++++++++--------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index de52d5bf..9555b823 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -86,32 +86,32 @@ } }, "type_handler_registry": { - "_class": "orcapod.hashing.semantic_hashing.type_handler_registry.TypeHandlerRegistry", + "_class": "orcapod.hashing.semantic_hashing.type_handler_registry.PythonTypeSemanticHasherRegistry", "_config": { "handlers": [ - [{"_type": "builtins.bytes"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesHandler", "_config": {}}], - [{"_type": "builtins.bytearray"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesHandler", "_config": {}}], - [{"_type": "pathlib.Path"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.PathContentHandler", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], - [{"_type": "upath.core.UPath"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UPathContentHandler", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], - [{"_type": "uuid.UUID"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UUIDHandler", "_config": {}}], - [{"_type": "types.FunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], - [{"_type": "types.BuiltinFunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], - [{"_type": "types.MethodType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], - [{"_type": "builtins.type"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.TypeObjectHandler", "_config": {}}], - [{"_type": "types.GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasHandler", "_config": {}}], - [{"_type": "types.UnionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UnionTypeHandler", "_config": {}}], - [{"_type": "typing._GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasHandler", "_config": {}}], - [{"_type": "typing._SpecialForm"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.SpecialFormHandler", "_config": {}}], - [{"_type": "pyarrow.Table"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableHandler", "_config": {"arrow_hasher": {"_ref": "arrow_hasher"}}}], - [{"_type": "pyarrow.RecordBatch"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableHandler", "_config": {"arrow_hasher": {"_ref": "arrow_hasher"}}}] + [{"_type": "builtins.bytes"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesSemanticHasher", "_config": {}}], + [{"_type": "builtins.bytearray"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesSemanticHasher", "_config": {}}], + [{"_type": "pathlib.Path"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.PathSemanticHasher", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], + [{"_type": "upath.core.UPath"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UPathSemanticHasher", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], + [{"_type": "uuid.UUID"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UUIDSemanticHasher", "_config": {}}], + [{"_type": "types.FunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionSemanticHasher", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], + [{"_type": "types.BuiltinFunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionSemanticHasher", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], + [{"_type": "types.MethodType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionSemanticHasher", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], + [{"_type": "builtins.type"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.TypeObjectSemanticHasher", "_config": {}}], + [{"_type": "types.GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasSemanticHasher", "_config": {}}], + [{"_type": "types.UnionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UnionTypeSemanticHasher", "_config": {}}], + [{"_type": "typing._GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasSemanticHasher", "_config": {}}], + [{"_type": "typing._SpecialForm"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.SpecialFormSemanticHasher", "_config": {}}], + [{"_type": "pyarrow.Table"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableSemanticHasher", "_config": {"arrow_hasher": {"_ref": "arrow_hasher"}}}], + [{"_type": "pyarrow.RecordBatch"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableSemanticHasher", "_config": {"arrow_hasher": {"_ref": "arrow_hasher"}}}] ] } }, "semantic_hasher": { - "_class": "orcapod.hashing.semantic_hashing.semantic_hasher.BaseSemanticHasher", + "_class": "orcapod.hashing.semantic_hashing.semantic_hasher.SemanticAwarePythonHasher", "_config": { "hasher_id": "semantic_v0.1", - "type_handler_registry": { + "type_semantic_hasher_registry": { "_ref": "type_handler_registry" } } From 305735c4e4231f26cb75fb374487187c8dda492a Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 18:21:39 +0000 Subject: [PATCH 12/33] refactor(tests): update hashing tests for renamed classes and methods --- .../test_file_hashing_consistency.py | 20 +- tests/test_hashing/test_semantic_hasher.py | 254 +++++++++--------- tests/test_hashing/test_uuid_handler.py | 53 ++-- 3 files changed, 174 insertions(+), 153 deletions(-) diff --git a/tests/test_hashing/test_file_hashing_consistency.py b/tests/test_hashing/test_file_hashing_consistency.py index e5bd4bbf..70412e9d 100644 --- a/tests/test_hashing/test_file_hashing_consistency.py +++ b/tests/test_hashing/test_file_hashing_consistency.py @@ -3,8 +3,8 @@ 1. **Arrow hasher path**: SemanticArrowHasher processes an Arrow table containing a path struct column → calls PythonPathStructConverter.hash_struct_dict → file_hasher. -2. **Semantic hasher path**: BaseSemanticHasher hashes a Python Path object → - calls PathContentHandler.handle → file_hasher. +2. **Semantic hasher path**: SemanticAwarePythonHasher hashes a Python Path object → + calls PathSemanticHasher.handle → file_hasher. Both paths must delegate to the same FileContentHasherProtocol so that identical file content always produces identical hashes, regardless of entry point. @@ -18,10 +18,10 @@ from orcapod.hashing.arrow_hashers import SemanticArrowHasher from orcapod.hashing.file_hashers import BasicFileHasher from orcapod.hashing.semantic_hashing.builtin_handlers import ( - register_builtin_handlers, + register_builtin_python_type_semantic_hashers, ) -from orcapod.hashing.semantic_hashing.semantic_hasher import BaseSemanticHasher -from orcapod.hashing.semantic_hashing.type_handler_registry import TypeHandlerRegistry +from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher +from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeSemanticHasherRegistry from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter @@ -52,11 +52,11 @@ def arrow_hasher(path_converter): @pytest.fixture def semantic_hasher(file_hasher): - """BaseSemanticHasher wired with the shared file_hasher via PathContentHandler.""" - registry = TypeHandlerRegistry() - register_builtin_handlers(registry, file_hasher=file_hasher) - return BaseSemanticHasher( - hasher_id="test_v1", type_handler_registry=registry, strict=True + """SemanticAwarePythonHasher wired with the shared file_hasher via PathSemanticHasher.""" + registry = PythonTypeSemanticHasherRegistry() + register_builtin_python_type_semantic_hashers(registry, file_hasher=file_hasher) + return SemanticAwarePythonHasher( + hasher_id="test_v1", type_semantic_hasher_registry=registry, strict=True ) diff --git a/tests/test_hashing/test_semantic_hasher.py b/tests/test_hashing/test_semantic_hasher.py index b2719b4a..873db99f 100644 --- a/tests/test_hashing/test_semantic_hasher.py +++ b/tests/test_hashing/test_semantic_hasher.py @@ -1,17 +1,17 @@ """ -Comprehensive test suite for the BaseSemanticHasher system. +Comprehensive test suite for the SemanticAwarePythonHasher system. Covers: - - BaseSemanticHasher: primitives, container type-tagging, determinism, + - SemanticAwarePythonHasher: primitives, container type-tagging, determinism, circular references, strict vs non-strict mode - ContentIdentifiableProtocol protocol: independent hashing, composability - - TypeHandlerRegistry: registration, MRO-aware lookup, unregister - - Built-in handlers: bytes, UUID, Path, functions, type objects + - PythonTypeSemanticHasherRegistry: registration, MRO-aware lookup, unregister + - Built-in hashers: bytes, UUID, Path, functions, type objects - ContentHash as terminal: returned as-is without re-hashing - ContentIdentifiableMixin: content_hash, __eq__, __hash__, caching, cache invalidation, injectable hasher - - Custom type handler registration and extension - - get_default_semantic_hasher / get_default_type_handler_registry + - Custom type hasher registration and extension + - get_default_semantic_hasher / get_default_python_type_semantic_hasher_registry """ from __future__ import annotations @@ -27,17 +27,19 @@ import pytest from orcapod.hashing.defaults import get_default_semantic_hasher -from orcapod.hashing.semantic_hashing.builtin_handlers import register_builtin_handlers +from orcapod.hashing.semantic_hashing.builtin_handlers import ( + register_builtin_python_type_semantic_hashers, +) from orcapod.hashing.semantic_hashing.content_identifiable_mixin import ( ContentIdentifiableMixin, ) from orcapod.hashing.semantic_hashing.semantic_hasher import ( - BaseSemanticHasher, + SemanticAwarePythonHasher, _is_namedtuple, ) from orcapod.hashing.semantic_hashing.type_handler_registry import ( - TypeHandlerRegistry, - get_default_type_handler_registry, + PythonTypeSemanticHasherRegistry, + get_default_python_type_semantic_hasher_registry, ) from orcapod.types import ContentHash @@ -46,22 +48,22 @@ # --------------------------------------------------------------------------- -def make_hasher(strict: bool = True) -> BaseSemanticHasher: - """Create a fresh BaseSemanticHasher with an isolated registry.""" - registry = TypeHandlerRegistry() - register_builtin_handlers(registry) - return BaseSemanticHasher( - hasher_id="test_v1", type_handler_registry=registry, strict=strict +def make_hasher(strict: bool = True) -> SemanticAwarePythonHasher: + """Create a fresh SemanticAwarePythonHasher with an isolated registry.""" + registry = PythonTypeSemanticHasherRegistry() + register_builtin_python_type_semantic_hashers(registry) + return SemanticAwarePythonHasher( + hasher_id="test_v1", type_semantic_hasher_registry=registry, strict=strict ) @pytest.fixture -def hasher() -> BaseSemanticHasher: +def hasher() -> SemanticAwarePythonHasher: return make_hasher(strict=True) @pytest.fixture -def lenient_hasher() -> BaseSemanticHasher: +def lenient_hasher() -> SemanticAwarePythonHasher: return make_hasher(strict=False) @@ -108,7 +110,7 @@ def identity_structure(self) -> Any: # --------------------------------------------------------------------------- -# 1. BaseSemanticHasher: primitives +# 1. SemanticAwarePythonHasher: primitives # --------------------------------------------------------------------------- @@ -152,7 +154,7 @@ def test_same_primitive_same_hash(self, hasher): # --------------------------------------------------------------------------- -# 2. BaseSemanticHasher: container type-tagging and determinism +# 2. SemanticAwarePythonHasher: container type-tagging and determinism # --------------------------------------------------------------------------- @@ -213,7 +215,7 @@ def test_hash_returns_content_hash(self, hasher): # --------------------------------------------------------------------------- -# 3. BaseSemanticHasher: namedtuples +# 3. SemanticAwarePythonHasher: namedtuples # --------------------------------------------------------------------------- @@ -249,7 +251,7 @@ def test_is_namedtuple_helper(self): # --------------------------------------------------------------------------- -# 4. BaseSemanticHasher: circular references +# 4. SemanticAwarePythonHasher: circular references # --------------------------------------------------------------------------- @@ -284,7 +286,7 @@ def test_circular_differs_from_non_circular(self, hasher): # --------------------------------------------------------------------------- -# 5. BaseSemanticHasher: strict vs non-strict mode +# 5. SemanticAwarePythonHasher: strict vs non-strict mode # --------------------------------------------------------------------------- @@ -297,7 +299,7 @@ def __init__(self, x: int) -> None: class TestStrictMode: def test_strict_raises_on_unknown_type(self, hasher): - with pytest.raises(TypeError, match="no TypeHandlerProtocol registered"): + with pytest.raises(TypeError, match="no PythonTypeSemanticHasherProtocol registered"): hasher.hash_object(Unhandled(1)) def test_non_strict_returns_content_hash(self, lenient_hasher): @@ -310,8 +312,8 @@ def test_non_strict_same_object_same_hash(self, lenient_hasher): assert h1 == h2 def test_strict_mode_flag(self): - strict = BaseSemanticHasher(hasher_id="s", strict=True) - lenient = BaseSemanticHasher(hasher_id="s", strict=False) + strict = SemanticAwarePythonHasher(hasher_id="s", strict=True) + lenient = SemanticAwarePythonHasher(hasher_id="s", strict=False) assert strict.strict is True assert lenient.strict is False @@ -795,7 +797,7 @@ def test_usable_in_set(self, hasher): assert len(s) == 2 def test_injectable_hasher(self): - custom_hasher = BaseSemanticHasher(hasher_id="injected_v9") + custom_hasher = SemanticAwarePythonHasher(hasher_id="injected_v9") rec = SimpleRecord("foo", 1, semantic_hasher=custom_hasher) assert rec.content_hash().method == "injected_v9" @@ -820,7 +822,7 @@ def test_repr_includes_hash(self, hasher): # --------------------------------------------------------------------------- -# 14. TypeHandlerRegistry +# 14. PythonTypeSemanticHasherRegistry # --------------------------------------------------------------------------- @@ -828,7 +830,7 @@ class _DummyHandler: def __init__(self, tag: str) -> None: self.tag = tag - def handle(self, obj: Any, hasher: Any) -> Any: + def hash(self, obj: Any, hasher: Any) -> Any: return f"{self.tag}:{obj}" @@ -844,78 +846,78 @@ class GrandChild(Child): pass -class TestTypeHandlerRegistry: +class TestPythonTypeSemanticHasherRegistry: def test_register_and_get_exact(self): - reg = TypeHandlerRegistry() + reg = PythonTypeSemanticHasherRegistry() h = _DummyHandler("base") reg.register(Base, h) - assert reg.get_handler(Base()) is h + assert reg.get_semantic_hasher(Base()) is h def test_mro_lookup_child(self): - reg = TypeHandlerRegistry() + reg = PythonTypeSemanticHasherRegistry() h = _DummyHandler("base") reg.register(Base, h) - assert reg.get_handler(Child()) is h + assert reg.get_semantic_hasher(Child()) is h def test_mro_lookup_grandchild(self): - reg = TypeHandlerRegistry() + reg = PythonTypeSemanticHasherRegistry() h = _DummyHandler("base") reg.register(Base, h) - assert reg.get_handler(GrandChild()) is h + assert reg.get_semantic_hasher(GrandChild()) is h def test_more_specific_handler_wins(self): - reg = TypeHandlerRegistry() + reg = PythonTypeSemanticHasherRegistry() h_base = _DummyHandler("base") h_child = _DummyHandler("child") reg.register(Base, h_base) reg.register(Child, h_child) - assert reg.get_handler(Child()) is h_child - assert reg.get_handler(GrandChild()) is h_child + assert reg.get_semantic_hasher(Child()) is h_child + assert reg.get_semantic_hasher(GrandChild()) is h_child def test_unregistered_returns_none(self): - reg = TypeHandlerRegistry() - assert reg.get_handler(Base()) is None + reg = PythonTypeSemanticHasherRegistry() + assert reg.get_semantic_hasher(Base()) is None def test_unregister_removes_handler(self): - reg = TypeHandlerRegistry() + reg = PythonTypeSemanticHasherRegistry() h = _DummyHandler("base") reg.register(Base, h) assert reg.unregister(Base) is True - assert reg.get_handler(Base()) is None + assert reg.get_semantic_hasher(Base()) is None def test_unregister_nonexistent_returns_false(self): - reg = TypeHandlerRegistry() + reg = PythonTypeSemanticHasherRegistry() assert reg.unregister(Base) is False def test_replace_existing_handler(self): - reg = TypeHandlerRegistry() + reg = PythonTypeSemanticHasherRegistry() h1 = _DummyHandler("first") h2 = _DummyHandler("second") reg.register(Base, h1) reg.register(Base, h2) - assert reg.get_handler(Base()) is h2 + assert reg.get_semantic_hasher(Base()) is h2 def test_register_non_type_raises(self): - reg = TypeHandlerRegistry() + reg = PythonTypeSemanticHasherRegistry() with pytest.raises(TypeError): reg.register("not_a_type", _DummyHandler("x")) # type: ignore[arg-type] def test_has_handler_exact(self): - reg = TypeHandlerRegistry() + reg = PythonTypeSemanticHasherRegistry() reg.register(Base, _DummyHandler("b")) - assert reg.has_handler(Base) is True + assert reg.has_semantic_hasher(Base) is True def test_has_handler_via_mro(self): - reg = TypeHandlerRegistry() + reg = PythonTypeSemanticHasherRegistry() reg.register(Base, _DummyHandler("b")) - assert reg.has_handler(Child) is True + assert reg.has_semantic_hasher(Child) is True def test_has_handler_false(self): - reg = TypeHandlerRegistry() - assert reg.has_handler(Base) is False + reg = PythonTypeSemanticHasherRegistry() + assert reg.has_semantic_hasher(Base) is False def test_registered_types_snapshot(self): - reg = TypeHandlerRegistry() + reg = PythonTypeSemanticHasherRegistry() reg.register(Base, _DummyHandler("b")) reg.register(Child, _DummyHandler("c")) types = reg.registered_types() @@ -923,7 +925,7 @@ def test_registered_types_snapshot(self): assert Child in types def test_len(self): - reg = TypeHandlerRegistry() + reg = PythonTypeSemanticHasherRegistry() assert len(reg) == 0 reg.register(Base, _DummyHandler("b")) assert len(reg) == 1 @@ -931,12 +933,12 @@ def test_len(self): assert len(reg) == 2 def test_get_handler_for_type(self): - reg = TypeHandlerRegistry() + reg = PythonTypeSemanticHasherRegistry() h = _DummyHandler("b") reg.register(Base, h) - assert reg.get_handler_for_type(Base) is h - assert reg.get_handler_for_type(Child) is h # via MRO - assert reg.get_handler_for_type(int) is None + assert reg.get_semantic_hasher_for_type(Base) is h + assert reg.get_semantic_hasher_for_type(Child) is h # via MRO + assert reg.get_semantic_hasher_for_type(int) is None # --------------------------------------------------------------------------- @@ -950,53 +952,53 @@ def __init__(self, degrees: float) -> None: class CelsiusHandler: - def handle(self, obj: Any, hasher: Any) -> Any: - return {"__type__": "Celsius", "degrees": obj.degrees} + def hash(self, obj: Any, hasher: Any) -> ContentHash: + return hasher.hash_object({"__type__": "Celsius", "degrees": obj.degrees}) class TestCustomHandlerRegistration: def test_register_custom_type(self): - registry = TypeHandlerRegistry() - register_builtin_handlers(registry) + registry = PythonTypeSemanticHasherRegistry() + register_builtin_python_type_semantic_hashers(registry) registry.register(Celsius, CelsiusHandler()) - custom_hasher = BaseSemanticHasher( - hasher_id="custom_v1", type_handler_registry=registry, strict=True + custom_hasher = SemanticAwarePythonHasher( + hasher_id="custom_v1", type_semantic_hasher_registry=registry, strict=True ) assert isinstance(custom_hasher.hash_object(Celsius(100.0)), ContentHash) def test_custom_handler_determinism(self): - registry = TypeHandlerRegistry() - register_builtin_handlers(registry) + registry = PythonTypeSemanticHasherRegistry() + register_builtin_python_type_semantic_hashers(registry) registry.register(Celsius, CelsiusHandler()) - custom_hasher = BaseSemanticHasher( - hasher_id="custom_v1", type_handler_registry=registry + custom_hasher = SemanticAwarePythonHasher( + hasher_id="custom_v1", type_semantic_hasher_registry=registry ) h1 = custom_hasher.hash_object(Celsius(37.5)) h2 = custom_hasher.hash_object(Celsius(37.5)) assert h1 == h2 def test_custom_handler_different_values_differ(self): - registry = TypeHandlerRegistry() - register_builtin_handlers(registry) + registry = PythonTypeSemanticHasherRegistry() + register_builtin_python_type_semantic_hashers(registry) registry.register(Celsius, CelsiusHandler()) - custom_hasher = BaseSemanticHasher( - hasher_id="custom_v1", type_handler_registry=registry + custom_hasher = SemanticAwarePythonHasher( + hasher_id="custom_v1", type_semantic_hasher_registry=registry ) assert custom_hasher.hash_object(Celsius(0.0)) != custom_hasher.hash_object( Celsius(100.0) ) def test_unregistered_type_still_strict(self): - hasher = BaseSemanticHasher(hasher_id="strict_v1", strict=True) + hasher = SemanticAwarePythonHasher(hasher_id="strict_v1", strict=True) with pytest.raises(TypeError): hasher.hash_object(Celsius(42.0)) def test_custom_handler_in_nested_structure(self): - registry = TypeHandlerRegistry() - register_builtin_handlers(registry) + registry = PythonTypeSemanticHasherRegistry() + register_builtin_python_type_semantic_hashers(registry) registry.register(Celsius, CelsiusHandler()) - custom_hasher = BaseSemanticHasher( - hasher_id="custom_v1", type_handler_registry=registry + custom_hasher = SemanticAwarePythonHasher( + hasher_id="custom_v1", type_semantic_hasher_registry=registry ) h = custom_hasher.hash_object({"temp": Celsius(36.6), "unit": "C"}) assert isinstance(h, ContentHash) @@ -1005,14 +1007,14 @@ def test_handler_returning_content_hash_is_terminal(self): """A handler that returns a ContentHash must not be re-hashed.""" class DirectHashHandler: - def handle(self, obj: Any, hasher: Any) -> ContentHash: + def hash(self, obj: Any, hasher: Any) -> ContentHash: return ContentHash("direct", b"\xaa" * 32) - registry = TypeHandlerRegistry() - register_builtin_handlers(registry) + registry = PythonTypeSemanticHasherRegistry() + register_builtin_python_type_semantic_hashers(registry) registry.register(Celsius, DirectHashHandler()) - custom_hasher = BaseSemanticHasher( - hasher_id="custom_v1", type_handler_registry=registry + custom_hasher = SemanticAwarePythonHasher( + hasher_id="custom_v1", type_semantic_hasher_registry=registry ) result = custom_hasher.hash_object(Celsius(0.0)) # The ContentHash returned by the handler should come back as-is @@ -1022,11 +1024,11 @@ def test_mro_aware_custom_handler(self): class FancyCelsius(Celsius): pass - registry = TypeHandlerRegistry() - register_builtin_handlers(registry) + registry = PythonTypeSemanticHasherRegistry() + register_builtin_python_type_semantic_hashers(registry) registry.register(Celsius, CelsiusHandler()) - custom_hasher = BaseSemanticHasher( - hasher_id="custom_v1", type_handler_registry=registry + custom_hasher = SemanticAwarePythonHasher( + hasher_id="custom_v1", type_semantic_hasher_registry=registry ) h = custom_hasher.hash_object(FancyCelsius(20.0)) assert isinstance(h, ContentHash) @@ -1039,10 +1041,10 @@ def __init__(self, k: float) -> None: self.k = k class KelvinHandler: - def handle(self, obj: Any, hasher: Any) -> Any: - return {"__type__": "Kelvin", "k": obj.k} + def hash(self, obj: Any, hasher: Any) -> ContentHash: + return hasher.hash_object({"__type__": "Kelvin", "k": obj.k}) - global_registry = get_default_type_handler_registry() + global_registry = get_default_python_type_semantic_hasher_registry() global_registry.register(Kelvin, KelvinHandler()) try: default_hasher = get_default_semantic_hasher() @@ -1058,14 +1060,14 @@ def handle(self, obj: Any, hasher: Any) -> Any: class TestGlobalSingletons: def test_get_default_semantic_hasher_returns_semantic_hasher(self): - assert isinstance(get_default_semantic_hasher(), BaseSemanticHasher) + assert isinstance(get_default_semantic_hasher(), SemanticAwarePythonHasher) def test_get_default_semantic_hasher_has_versioned_id(self): assert get_default_semantic_hasher().hasher_id == "semantic_v0.1" def test_get_default_type_handler_registry_is_singleton(self): - r1 = get_default_type_handler_registry() - r2 = get_default_type_handler_registry() + r1 = get_default_python_type_semantic_hasher_registry() + r2 = get_default_python_type_semantic_hasher_registry() assert r1 is r2 def test_default_registry_has_builtin_handlers(self): @@ -1073,22 +1075,22 @@ def test_default_registry_has_builtin_handlers(self): import typing as _typing - reg = get_default_type_handler_registry() - assert reg.has_handler(bytes) - assert reg.has_handler(bytearray) - assert reg.has_handler(UUID) - assert reg.has_handler(Path) - assert reg.has_handler(_types.FunctionType) - assert reg.has_handler(type) - assert reg.has_handler(_types.GenericAlias) - assert reg.has_handler(_types.UnionType) - assert reg.has_handler(_typing._GenericAlias) # type: ignore[attr-defined] - assert reg.has_handler(_typing._SpecialForm) # type: ignore[attr-defined] + reg = get_default_python_type_semantic_hasher_registry() + assert reg.has_semantic_hasher(bytes) + assert reg.has_semantic_hasher(bytearray) + assert reg.has_semantic_hasher(UUID) + assert reg.has_semantic_hasher(Path) + assert reg.has_semantic_hasher(_types.FunctionType) + assert reg.has_semantic_hasher(type) + assert reg.has_semantic_hasher(_types.GenericAlias) + assert reg.has_semantic_hasher(_types.UnionType) + assert reg.has_semantic_hasher(_typing._GenericAlias) # type: ignore[attr-defined] + assert reg.has_semantic_hasher(_typing._SpecialForm) # type: ignore[attr-defined] def test_default_registry_has_no_content_hash_handler(self): """ContentHash is handled as a terminal -- no registry entry needed.""" - reg = get_default_type_handler_registry() - assert not reg.has_handler(ContentHash) + reg = get_default_python_type_semantic_hasher_registry() + assert not reg.has_semantic_hasher(ContentHash) def test_default_hasher_can_hash_common_types(self): h = get_default_semantic_hasher() @@ -1118,7 +1120,7 @@ def test_content_hash_conversion_methods(self): def _sha256_json(obj: Any, hasher_id: str) -> "ContentHash": - """Manually JSON-serialize *obj* with the same settings as BaseSemanticHasher + """Manually JSON-serialize *obj* with the same settings as SemanticAwarePythonHasher and return the resulting ContentHash.""" json_bytes = json.dumps( obj, @@ -1134,7 +1136,7 @@ class TestJsonNormalizationConsistency: """Verify that hash_object produces hashes identical to directly SHA-256 hashing the canonical tagged-JSON form that _expand_structure produces. - These tests treat BaseSemanticHasher as a black box and anchor its output to + These tests treat SemanticAwarePythonHasher as a black box and anchor its output to a human-verifiable serialization format, ensuring the algorithm is transparent and reproducible without the library. """ @@ -1142,7 +1144,7 @@ class TestJsonNormalizationConsistency: HASHER_ID = "test_v1" @pytest.fixture - def h(self) -> BaseSemanticHasher: + def h(self) -> SemanticAwarePythonHasher: return make_hasher(strict=True) # ------------------------------------------------------------------ @@ -1284,7 +1286,7 @@ def test_no_resolver_uses_obj_content_hash(self): """Without a resolver hash_object returns obj.content_hash() -- using the object's own hasher.""" calling_hasher = make_hasher(strict=True) - obj_hasher = BaseSemanticHasher(hasher_id="obj_hasher_v1") + obj_hasher = SemanticAwarePythonHasher(hasher_id="obj_hasher_v1") rec = SimpleRecord("hello", 1, semantic_hasher=obj_hasher) result = calling_hasher.hash_object(rec) @@ -1294,7 +1296,7 @@ def test_no_resolver_uses_obj_content_hash(self): def test_resolver_overrides_default(self): """When a resolver is provided it takes priority over obj.content_hash().""" calling_hasher = make_hasher(strict=True) - obj_hasher = BaseSemanticHasher(hasher_id="obj_hasher_v1") + obj_hasher = SemanticAwarePythonHasher(hasher_id="obj_hasher_v1") rec = SimpleRecord("hello", 1, semantic_hasher=obj_hasher) # Resolver that uses the calling hasher instead of the object's own hasher @@ -1307,7 +1309,7 @@ def test_resolver_overrides_default(self): def test_resolver_differs_from_no_resolver_when_hashers_differ(self): """When the object's hasher differs from the calling hasher, resolver and no-resolver produce different results.""" - obj_hasher = BaseSemanticHasher(hasher_id="obj_v99") + obj_hasher = SemanticAwarePythonHasher(hasher_id="obj_v99") calling_hasher = make_hasher(strict=True) rec = SimpleRecord("data", 42, semantic_hasher=obj_hasher) @@ -1324,7 +1326,7 @@ def test_resolver_differs_from_no_resolver_when_hashers_differ(self): def test_resolver_propagates_through_list(self): """Resolver is applied to CI objects nested inside a list.""" calling_hasher = make_hasher(strict=True) - obj_hasher = BaseSemanticHasher(hasher_id="inner_v1") + obj_hasher = SemanticAwarePythonHasher(hasher_id="inner_v1") inner = SimpleRecord("inner", 99, semantic_hasher=obj_hasher) # With no resolver the embedded token uses inner's own hasher_id @@ -1344,7 +1346,7 @@ def test_resolver_propagates_through_list(self): def test_resolver_propagates_through_tuple(self): """Resolver is applied to CI objects nested inside a tuple.""" calling_hasher = make_hasher(strict=True) - obj_hasher = BaseSemanticHasher(hasher_id="inner_v1") + obj_hasher = SemanticAwarePythonHasher(hasher_id="inner_v1") inner = SimpleRecord("x", 1, semantic_hasher=obj_hasher) resolver = lambda obj: calling_hasher.hash_object(obj.identity_structure()) @@ -1356,7 +1358,7 @@ def test_resolver_propagates_through_tuple(self): def test_resolver_propagates_through_dict(self): """Resolver is applied to CI objects nested inside a dict value.""" calling_hasher = make_hasher(strict=True) - obj_hasher = BaseSemanticHasher(hasher_id="inner_v1") + obj_hasher = SemanticAwarePythonHasher(hasher_id="inner_v1") inner = SimpleRecord("v", 2, semantic_hasher=obj_hasher) resolver = lambda obj: calling_hasher.hash_object(obj.identity_structure()) @@ -1388,7 +1390,7 @@ def test_resolver_propagates_through_handler_result(self): """When a registered handler returns a ContentIdentifiable, the resolver is applied to that result.""" calling_hasher = make_hasher(strict=True) - obj_hasher = BaseSemanticHasher(hasher_id="inner_v1") + obj_hasher = SemanticAwarePythonHasher(hasher_id="inner_v1") inner = SimpleRecord("inner", 5, semantic_hasher=obj_hasher) resolved = [] @@ -1406,7 +1408,7 @@ def resolver(obj): def test_cached_result_reused_across_calls(self): """content_hash() caches by hasher_id -- the same ContentHash object is returned on repeated calls with the same hasher.""" - obj_hasher = BaseSemanticHasher(hasher_id="cached_v1") + obj_hasher = SemanticAwarePythonHasher(hasher_id="cached_v1") rec = SimpleRecord("y", 5, semantic_hasher=obj_hasher) first = rec.content_hash() @@ -1432,8 +1434,8 @@ class TestUniformHasherPropagation: def test_entry_point_hasher_overrides_nested_hasher(self): """outer.content_hash() uses outer's hasher for inner, even though inner holds a different hasher.""" - hasher_a = BaseSemanticHasher(hasher_id="hasher_a") - hasher_b = BaseSemanticHasher(hasher_id="hasher_b") + hasher_a = SemanticAwarePythonHasher(hasher_id="hasher_a") + hasher_b = SemanticAwarePythonHasher(hasher_id="hasher_b") inner = SimpleRecord("inner", 1, semantic_hasher=hasher_a) outer = NestedRecord("outer", inner, semantic_hasher=hasher_b) @@ -1461,9 +1463,9 @@ def test_entry_point_hasher_overrides_nested_hasher(self): def test_three_level_chain_uses_entry_hasher_throughout(self): """In a three-level chain A→B→C, calling C.content_hash() uses C's hasher for A and B as well, even though each holds a different hasher.""" - hasher_a = BaseSemanticHasher(hasher_id="hasher_a") - hasher_b = BaseSemanticHasher(hasher_id="hasher_b") - hasher_c = BaseSemanticHasher(hasher_id="hasher_c") + hasher_a = SemanticAwarePythonHasher(hasher_id="hasher_a") + hasher_b = SemanticAwarePythonHasher(hasher_id="hasher_b") + hasher_c = SemanticAwarePythonHasher(hasher_id="hasher_c") a = SimpleRecord("a", 1, semantic_hasher=hasher_a) b = NestedRecord("b", a, semantic_hasher=hasher_b) @@ -1494,8 +1496,8 @@ def test_three_level_chain_uses_entry_hasher_throughout(self): def test_independent_call_still_uses_own_hasher(self): """When an intermediate object is called directly (not as part of a larger chain), it uses its own stored hasher as before.""" - hasher_a = BaseSemanticHasher(hasher_id="hasher_a") - hasher_b = BaseSemanticHasher(hasher_id="hasher_b") + hasher_a = SemanticAwarePythonHasher(hasher_id="hasher_a") + hasher_b = SemanticAwarePythonHasher(hasher_id="hasher_b") inner = SimpleRecord("inner", 1, semantic_hasher=hasher_a) outer = NestedRecord("outer", inner, semantic_hasher=hasher_b) @@ -1507,8 +1509,8 @@ def test_independent_call_still_uses_own_hasher(self): def test_cache_keyed_by_hasher_id_avoids_recomputation(self): """The cache is keyed by hasher_id, so a nested object computed under hasher_c is cached and reused on a second call with hasher_c.""" - hasher_a = BaseSemanticHasher(hasher_id="hasher_a") - hasher_c = BaseSemanticHasher(hasher_id="hasher_c") + hasher_a = SemanticAwarePythonHasher(hasher_id="hasher_a") + hasher_c = SemanticAwarePythonHasher(hasher_id="hasher_c") inner = SimpleRecord("inner", 42, semantic_hasher=hasher_a) diff --git a/tests/test_hashing/test_uuid_handler.py b/tests/test_hashing/test_uuid_handler.py index 8b69d78b..3e6fe1f8 100644 --- a/tests/test_hashing/test_uuid_handler.py +++ b/tests/test_hashing/test_uuid_handler.py @@ -1,32 +1,51 @@ -"""Tests for UUIDHandler low-level handle() method behaviour. +"""Tests for UUIDSemanticHasher hash() method behaviour. -Verifies that UUIDHandler returns the 16-byte binary representation of a -UUID, consistent with OrcaPod's canonical ``pa.binary(16)`` Arrow storage -format. +Verifies that UUIDSemanticHasher produces a ContentHash based on the 16-byte +binary representation of a UUID, consistent with OrcaPod's canonical +``pa.binary(16)`` Arrow storage format. """ from __future__ import annotations import uuid as _uuid +from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher +from orcapod.types import ContentHash -def test_uuid_handler_returns_bytes(): - """UUIDHandler should return the 16-byte binary representation.""" - from orcapod.hashing.semantic_hashing.builtin_handlers import UUIDHandler - handler = UUIDHandler() +def _make_hasher() -> SemanticAwarePythonHasher: + from orcapod.hashing.semantic_hashing.builtin_handlers import ( + register_builtin_python_type_semantic_hashers, + ) + from orcapod.hashing.semantic_hashing.type_handler_registry import ( + PythonTypeSemanticHasherRegistry, + ) + + registry = PythonTypeSemanticHasherRegistry() + register_builtin_python_type_semantic_hashers(registry) + return SemanticAwarePythonHasher( + hasher_id="test_v1", type_semantic_hasher_registry=registry, strict=True + ) + + +def test_uuid_handler_returns_content_hash(): + """UUIDSemanticHasher should return a ContentHash for a UUID.""" + hasher = _make_hasher() u = _uuid.UUID("550e8400-e29b-41d4-a716-446655440000") - result = handler.handle(u, hasher=None) # type: ignore[arg-type] - assert result == u.bytes - assert isinstance(result, bytes) - assert len(result) == 16 + result = hasher.hash_object(u) + assert isinstance(result, ContentHash) -def test_uuid_handler_different_uuids_produce_different_bytes(): - """Different UUID values must produce different byte sequences.""" - from orcapod.hashing.semantic_hashing.builtin_handlers import UUIDHandler +def test_uuid_handler_same_uuid_same_hash(): + """Same UUID value produces the same ContentHash.""" + hasher = _make_hasher() + u = _uuid.UUID("550e8400-e29b-41d4-a716-446655440000") + assert hasher.hash_object(u) == hasher.hash_object(u) + - handler = UUIDHandler() +def test_uuid_handler_different_uuids_produce_different_hashes(): + """Different UUID values must produce different ContentHash objects.""" + hasher = _make_hasher() u1 = _uuid.uuid4() u2 = _uuid.uuid4() - assert handler.handle(u1, None) != handler.handle(u2, None) # type: ignore[arg-type] + assert hasher.hash_object(u1) != hasher.hash_object(u2) From 86870eb571e06f5cbd579e4f035ce67a102fbad6 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 18:29:56 +0000 Subject: [PATCH 13/33] =?UTF-8?q?test(semantic=5Fhasher):=20rename=20=5FDu?= =?UTF-8?q?mmyHandler=20=E2=86=92=20=5FDummySemanticHasher,=20fix=20hash()?= =?UTF-8?q?=20return=20type?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Renamed _DummyHandler class to _DummySemanticHasher for clarity - Changed hash() method to return ContentHash via hasher.hash_object() instead of raw string - Updated all 13 usages throughout the test class Co-Authored-By: Claude Sonnet 4.6 --- tests/test_hashing/test_semantic_hasher.py | 37 +++++++++++----------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/tests/test_hashing/test_semantic_hasher.py b/tests/test_hashing/test_semantic_hasher.py index 873db99f..c6584155 100644 --- a/tests/test_hashing/test_semantic_hasher.py +++ b/tests/test_hashing/test_semantic_hasher.py @@ -826,12 +826,13 @@ def test_repr_includes_hash(self, hasher): # --------------------------------------------------------------------------- -class _DummyHandler: +class _DummySemanticHasher: def __init__(self, tag: str) -> None: self.tag = tag def hash(self, obj: Any, hasher: Any) -> Any: - return f"{self.tag}:{obj}" + # Returns a ContentHash by delegating to the outer hasher + return hasher.hash_object(f"{self.tag}:{obj}") class Base: @@ -849,26 +850,26 @@ class GrandChild(Child): class TestPythonTypeSemanticHasherRegistry: def test_register_and_get_exact(self): reg = PythonTypeSemanticHasherRegistry() - h = _DummyHandler("base") + h = _DummySemanticHasher("base") reg.register(Base, h) assert reg.get_semantic_hasher(Base()) is h def test_mro_lookup_child(self): reg = PythonTypeSemanticHasherRegistry() - h = _DummyHandler("base") + h = _DummySemanticHasher("base") reg.register(Base, h) assert reg.get_semantic_hasher(Child()) is h def test_mro_lookup_grandchild(self): reg = PythonTypeSemanticHasherRegistry() - h = _DummyHandler("base") + h = _DummySemanticHasher("base") reg.register(Base, h) assert reg.get_semantic_hasher(GrandChild()) is h def test_more_specific_handler_wins(self): reg = PythonTypeSemanticHasherRegistry() - h_base = _DummyHandler("base") - h_child = _DummyHandler("child") + h_base = _DummySemanticHasher("base") + h_child = _DummySemanticHasher("child") reg.register(Base, h_base) reg.register(Child, h_child) assert reg.get_semantic_hasher(Child()) is h_child @@ -880,7 +881,7 @@ def test_unregistered_returns_none(self): def test_unregister_removes_handler(self): reg = PythonTypeSemanticHasherRegistry() - h = _DummyHandler("base") + h = _DummySemanticHasher("base") reg.register(Base, h) assert reg.unregister(Base) is True assert reg.get_semantic_hasher(Base()) is None @@ -891,8 +892,8 @@ def test_unregister_nonexistent_returns_false(self): def test_replace_existing_handler(self): reg = PythonTypeSemanticHasherRegistry() - h1 = _DummyHandler("first") - h2 = _DummyHandler("second") + h1 = _DummySemanticHasher("first") + h2 = _DummySemanticHasher("second") reg.register(Base, h1) reg.register(Base, h2) assert reg.get_semantic_hasher(Base()) is h2 @@ -900,16 +901,16 @@ def test_replace_existing_handler(self): def test_register_non_type_raises(self): reg = PythonTypeSemanticHasherRegistry() with pytest.raises(TypeError): - reg.register("not_a_type", _DummyHandler("x")) # type: ignore[arg-type] + reg.register("not_a_type", _DummySemanticHasher("x")) # type: ignore[arg-type] def test_has_handler_exact(self): reg = PythonTypeSemanticHasherRegistry() - reg.register(Base, _DummyHandler("b")) + reg.register(Base, _DummySemanticHasher("b")) assert reg.has_semantic_hasher(Base) is True def test_has_handler_via_mro(self): reg = PythonTypeSemanticHasherRegistry() - reg.register(Base, _DummyHandler("b")) + reg.register(Base, _DummySemanticHasher("b")) assert reg.has_semantic_hasher(Child) is True def test_has_handler_false(self): @@ -918,8 +919,8 @@ def test_has_handler_false(self): def test_registered_types_snapshot(self): reg = PythonTypeSemanticHasherRegistry() - reg.register(Base, _DummyHandler("b")) - reg.register(Child, _DummyHandler("c")) + reg.register(Base, _DummySemanticHasher("b")) + reg.register(Child, _DummySemanticHasher("c")) types = reg.registered_types() assert Base in types assert Child in types @@ -927,14 +928,14 @@ def test_registered_types_snapshot(self): def test_len(self): reg = PythonTypeSemanticHasherRegistry() assert len(reg) == 0 - reg.register(Base, _DummyHandler("b")) + reg.register(Base, _DummySemanticHasher("b")) assert len(reg) == 1 - reg.register(Child, _DummyHandler("c")) + reg.register(Child, _DummySemanticHasher("c")) assert len(reg) == 2 def test_get_handler_for_type(self): reg = PythonTypeSemanticHasherRegistry() - h = _DummyHandler("b") + h = _DummySemanticHasher("b") reg.register(Base, h) assert reg.get_semantic_hasher_for_type(Base) is h assert reg.get_semantic_hasher_for_type(Child) is h # via MRO From d1702329f301a5bcddda85c1356d30a560e28fcc Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 18:52:46 +0000 Subject: [PATCH 14/33] feat(visitors): add visit_extension dispatch; rewrite SemanticHashingVisitor for extension types - Add visit_extension() to ArrowTypeDataVisitor with passthrough default - visit() now checks for pa.ExtensionType BEFORE struct check to prevent extension types with struct storage being swallowed by visit_struct - Rewrite SemanticHashingVisitor to use type_converter + python_hasher instead of semantic_registry; resolves extension types via the logical type registry and produces pa.large_binary() tokens of the form ::: - Update StarfixArrowHasher constructor to accept type_converter instead of semantic_registry; python_hasher resolved lazily from context to break the circular dependency in the JSON spec - Update v0.1.json component ordering so type_converter is created before arrow_hasher (which now requires it) - Update versioned_hashers.py, test_starfix_arrow_hasher.py, and test_semantic_registry.py to use the new API - Add tests/test_hashing/test_extension_type_hashing.py with 6 tests covering dispatch routing, hash stability, null passthrough, and binary encoding format Co-Authored-By: Claude Sonnet 4.6 --- src/orcapod/contexts/data/v0.1.json | 18 +- src/orcapod/hashing/arrow_hashers.py | 103 ++++---- src/orcapod/hashing/versioned_hashers.py | 29 +-- src/orcapod/hashing/visitors.py | 229 ++++++++++-------- .../test_extension_type_hashing.py | 121 +++++++++ .../test_hashing/test_starfix_arrow_hasher.py | 5 +- .../test_semantic_registry.py | 50 ++-- 7 files changed, 349 insertions(+), 206 deletions(-) create mode 100644 tests/test_hashing/test_extension_type_hashing.py diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index 9555b823..a25d6e60 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -27,15 +27,6 @@ } } }, - "arrow_hasher": { - "_class": "orcapod.hashing.arrow_hashers.StarfixArrowHasher", - "_config": { - "hasher_id": "arrow_v0.1", - "semantic_registry": { - "_ref": "semantic_registry" - } - } - }, "type_converter": { "_class": "orcapod.semantic_types.universal_converter.UniversalTypeConverter", "_config": { @@ -85,6 +76,15 @@ "include_defaults": true } }, + "arrow_hasher": { + "_class": "orcapod.hashing.arrow_hashers.StarfixArrowHasher", + "_config": { + "hasher_id": "arrow_v0.1", + "type_converter": { + "_ref": "type_converter" + } + } + }, "type_handler_registry": { "_class": "orcapod.hashing.semantic_hashing.type_handler_registry.PythonTypeSemanticHasherRegistry", "_config": { diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index f0931cdf..3f306c7a 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -1,7 +1,7 @@ import hashlib import json from collections.abc import Callable -from typing import Any +from typing import TYPE_CHECKING, Any import pyarrow as pa from starfix import ArrowDigester @@ -13,6 +13,10 @@ from orcapod.types import ContentHash from orcapod.utils import arrow_utils +if TYPE_CHECKING: + from orcapod.semantic_types.universal_converter import UniversalTypeConverter + from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher + SERIALIZATION_METHOD_LUT: dict[str, Callable[[pa.Table], bytes]] = { "logical": arrow_serialization.serialize_table_logical, } @@ -97,11 +101,10 @@ def hasher_id(self) -> str: return self._hasher_id def _process_table_columns(self, table: pa.Table | pa.RecordBatch) -> pa.Table: - """ - Process table columns using visitor pattern to handle nested semantic types. + """Process table columns using the semantic registry to hash struct-typed semantic columns. - This replaces the old column-by-column processing with a visitor-based approach - that can handle semantic types nested inside complex data structures. + Traverses each column and replaces recognised semantic struct types (detected by + struct signature via ``SemanticTypeRegistry``) with their content-hash strings. """ # TODO: Process in batchwise/chunk-wise fashion for memory efficiency # Currently using to_pylist() for simplicity but this loads entire table into memory @@ -109,36 +112,28 @@ def _process_table_columns(self, table: pa.Table | pa.RecordBatch) -> pa.Table: new_columns = [] new_fields = [] - # Import here to avoid circular dependencies for i, field in enumerate(table.schema): - # Convert column to struct dicts for processing column_data = table.column(i).to_pylist() - # TODO: verify the functioning of the visitor pattern - # Create fresh visitor for each column (stateless approach) - visitor = SemanticHashingVisitor(self.semantic_registry) - try: - # Use visitor to transform both type and data - new_type = None - processed_data = [] - for c in column_data: - processed_type, processed_value = visitor.visit(field.type, c) - if new_type is None: - new_type = processed_type - processed_data.append(processed_value) - - # Create new Arrow column from processed data - assert new_type is not None, "Failed to infer new column type" - # TODO: revisit this logic - new_column = pa.array(processed_data, type=new_type) - new_field = pa.field(field.name, new_type) - - new_columns.append(new_column) - new_fields.append(new_field) + if pa.types.is_struct(field.type): + converter = self.semantic_registry.get_converter_for_struct_signature(field.type) + if converter is not None: + # Semantic struct — replace with hash strings + processed_data = [ + converter.hash_struct_dict(row) if row is not None else None + for row in column_data + ] + new_type = pa.large_string() + new_columns.append(pa.array(processed_data, type=new_type)) + new_fields.append(pa.field(field.name, new_type)) + continue + + # Not a semantic type — pass through unchanged + new_columns.append(table.column(i)) + new_fields.append(field) except Exception as e: - # Add context about which column failed raise RuntimeError( f"Failed to process column '{field.name}': {str(e)}" ) from e @@ -248,11 +243,10 @@ class StarfixArrowHasher: Pipeline -------- 1. **Semantic pre-processing** — the ``SemanticHashingVisitor`` traverses - every column and replaces recognised semantic types (e.g. ``Path`` - structs) with their content-addressed hash strings. This step runs - before the Arrow bytes are ever touched by starfix, so the final hash - captures *file content* for path-typed columns rather than the raw - path string. + every column and replaces recognised extension-typed columns (e.g. ``Path``) + with their content-addressed hash bytes. This step runs before the Arrow + bytes are ever touched by starfix, so the final hash captures *file content* + for path-typed columns rather than the raw path string. 2. **Starfix hashing** — ``ArrowDigester.hash_table`` (or ``ArrowDigester.hash_schema``) is called on the pre-processed table / schema. The digester is column-order-independent and normalises @@ -262,8 +256,12 @@ class StarfixArrowHasher: Parameters ---------- - semantic_registry: - Registry of semantic type converters used during pre-processing. + type_converter: + ``UniversalTypeConverter`` used by ``SemanticHashingVisitor`` to resolve + Arrow extension types to Python types and convert storage values. + python_hasher: + ``SemanticAwarePythonHasher`` used by ``SemanticHashingVisitor`` to hash + Python objects produced from extension-typed columns. hasher_id: String identifier embedded in every ``ContentHash`` produced by this hasher. Bump this value whenever the hash algorithm changes @@ -272,26 +270,45 @@ class StarfixArrowHasher: def __init__( self, - semantic_registry: SemanticTypeRegistry, + type_converter: "UniversalTypeConverter", hasher_id: str, + python_hasher: "SemanticAwarePythonHasher | None" = None, ) -> None: self._hasher_id = hasher_id - self.semantic_registry = semantic_registry + self._type_converter = type_converter + self._python_hasher = python_hasher @property def hasher_id(self) -> str: return self._hasher_id + def _get_python_hasher(self) -> "SemanticAwarePythonHasher": + """Return the python_hasher, lazily resolving from default context if not set. + + Lazy resolution breaks the circular dependency that would arise if ``arrow_hasher`` + were constructed before ``semantic_hasher`` in the context JSON spec (which is the + natural order since ``type_handler_registry`` references ``arrow_hasher`` for + ``ArrowTableSemanticHasher``). + """ + if self._python_hasher is not None: + return self._python_hasher + from orcapod.contexts import get_default_context + return get_default_context().semantic_hasher # type: ignore[return-value] + def _process_table_columns(self, table: pa.Table | pa.RecordBatch) -> pa.Table: - """Replace semantic-typed columns with their content-hash strings.""" + """Replace extension-typed columns with their content-hash bytes.""" new_columns: list[pa.Array] = [] new_fields: list[pa.Field] = [] + python_hasher = self._get_python_hasher() + for i, field in enumerate(table.schema): - # Short-circuit: primitive columns cannot contain semantic types, so skip - # the costly Python round-trip and reuse the original Arrow array directly. + # Short-circuit: primitive columns (non-extension, non-struct, non-list, non-map) + # cannot contain extension semantic types, so skip the costly Python round-trip + # and reuse the original Arrow array directly. if not ( - pa.types.is_struct(field.type) + isinstance(field.type, pa.ExtensionType) + or pa.types.is_struct(field.type) or pa.types.is_list(field.type) or pa.types.is_large_list(field.type) or pa.types.is_fixed_size_list(field.type) @@ -302,7 +319,7 @@ def _process_table_columns(self, table: pa.Table | pa.RecordBatch) -> pa.Table: continue column_data = table.column(i).to_pylist() - visitor = SemanticHashingVisitor(self.semantic_registry) + visitor = SemanticHashingVisitor(self._type_converter, python_hasher) try: new_type: pa.DataType | None = None diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index b12bd2d3..080cbec6 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -94,10 +94,10 @@ def get_versioned_semantic_arrow_hasher( hasher_id: str = _CURRENT_ARROW_HASHER_ID, ) -> hp.ArrowHasherProtocol: """ - Return a SemanticArrowHasher configured for the current version. + Return a StarfixArrowHasher configured for the current version. The arrow hasher handles Arrow table / RecordBatch hashing with - semantic-type awareness (e.g. Path columns are hashed by file content). + extension-type awareness (e.g. Path columns are hashed by file content). Parameters ---------- @@ -107,34 +107,19 @@ def get_versioned_semantic_arrow_hasher( Returns ------- ArrowHasherProtocol - A fully configured SemanticArrowHasher instance. + A fully configured StarfixArrowHasher instance. """ + from orcapod.contexts import get_default_context from orcapod.hashing.arrow_hashers import StarfixArrowHasher - from orcapod.hashing.file_hashers import BasicFileHasher - from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry - from orcapod.semantic_types.semantic_struct_converters import ( - PythonPathStructConverter, - UUIDStructConverter, - ) - # Build a default semantic registry populated with the standard converters. - # We use Any-typed locals here to side-step type-checker false positives - # that arise from the protocol definition of SemanticStructConverterProtocol having - # a slightly different hash_struct_dict signature than the concrete class. - registry: Any = SemanticTypeRegistry() - file_hasher = BasicFileHasher(algorithm="sha256") - path_converter: Any = PythonPathStructConverter(file_hasher=file_hasher) - registry.register_converter("path", path_converter) - uuid_converter: Any = UUIDStructConverter() - registry.register_converter("uuid", uuid_converter) + ctx = get_default_context() logger.debug( "get_versioned_semantic_arrow_hasher: creating StarfixArrowHasher " "(hasher_id=%r)", hasher_id, ) - hasher: Any = StarfixArrowHasher( + return StarfixArrowHasher( hasher_id=hasher_id, - semantic_registry=registry, + type_converter=ctx.type_converter, ) - return hasher diff --git a/src/orcapod/hashing/visitors.py b/src/orcapod/hashing/visitors.py index f3a6fe50..b257f7f6 100644 --- a/src/orcapod/hashing/visitors.py +++ b/src/orcapod/hashing/visitors.py @@ -1,10 +1,5 @@ """ -SUGGESTED FILE: src/orcapod/hashing/visitors.py - Generic visitor pattern for traversing Arrow types and data simultaneously. - -This provides a base visitor class that can be extended for various processing needs -like semantic hashing, validation, data cleaning, etc. """ from __future__ import annotations @@ -12,68 +7,98 @@ from abc import ABC, abstractmethod from typing import TYPE_CHECKING, Any -from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: import pyarrow as pa + from orcapod.semantic_types.universal_converter import UniversalTypeConverter + from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher else: pa = LazyModule("pyarrow") class ArrowTypeDataVisitor(ABC): - """ - Base visitor for traversing Arrow types and data simultaneously. - - This enables processing that needs to transform both the Arrow schema - and the corresponding data in a single pass. - """ + """Base visitor for traversing Arrow types and data simultaneously.""" @abstractmethod def visit_struct( self, struct_type: "pa.StructType", data: dict | None ) -> tuple["pa.DataType", Any]: - """Visit a struct type with its data""" + """Visit a struct type with its data.""" pass @abstractmethod def visit_list( self, list_type: "pa.ListType", data: list | None ) -> tuple["pa.DataType", Any]: - """Visit a list type with its data""" + """Visit a list type with its data.""" pass @abstractmethod def visit_map( self, map_type: "pa.MapType", data: dict | None ) -> tuple["pa.DataType", Any]: - """Visit a map type with its data""" + """Visit a map type with its data.""" pass @abstractmethod def visit_primitive( self, primitive_type: "pa.DataType", data: Any ) -> tuple["pa.DataType", Any]: - """Visit a primitive type with its data""" + """Visit a primitive type with its data.""" pass - def visit(self, arrow_type: "pa.DataType", data: Any) -> tuple["pa.DataType", Any]: + def visit_extension( + self, + extension_type: "pa.ExtensionType", + storage_value: Any, + ) -> tuple["pa.DataType", Any]: + """Handle an Arrow extension type. + + Default implementation: passthrough — preserves the extension type and its + storage value unchanged so that the downstream ``StarfixArrowHasher`` / + ``ArrowDigester`` sees the full extension metadata when it receives the + pre-processed table. + + Subclasses may override to convert recognised extension types to a hashed + ``pa.large_binary()`` value. + + Args: + extension_type: The Arrow extension type. + storage_value: The storage-level value (result of ``to_pylist()`` on the column). + + Returns: + Tuple of ``(new_arrow_type, new_data)``. """ - Main dispatch method that routes to appropriate visit method. + return extension_type, storage_value + + def visit(self, arrow_type: "pa.DataType", data: Any) -> tuple["pa.DataType", Any]: + """Main dispatch method that routes to the appropriate visit method. + + Extension types are checked **first** — before the struct check — because + extension types with struct storage would otherwise be incorrectly routed + into ``visit_struct``. After ``visit_extension``, the result is re-visited + only if the type changed AND is no longer an extension type (enables + composability, avoids infinite recursion). Args: - arrow_type: Arrow data type to process - data: Corresponding data value + arrow_type: Arrow data type to process. + data: Corresponding data value. Returns: - Tuple of (new_arrow_type, new_data) + Tuple of ``(new_arrow_type, new_data)``. """ + if isinstance(arrow_type, pa.ExtensionType): + new_type, new_data = self.visit_extension(arrow_type, data) + if new_type is not arrow_type and not isinstance(new_type, pa.ExtensionType): + return self.visit(new_type, new_data) + return new_type, new_data + if pa.types.is_struct(arrow_type): return self.visit_struct(arrow_type, data) elif pa.types.is_list(arrow_type) or pa.types.is_large_list(arrow_type): return self.visit_list(arrow_type, data) elif pa.types.is_fixed_size_list(arrow_type): - # Treat fixed-size lists like regular lists for processing return self.visit_list(arrow_type, data) elif pa.types.is_map(arrow_type): return self.visit_map(arrow_type, data) @@ -83,11 +108,7 @@ def visit(self, arrow_type: "pa.DataType", data: Any) -> tuple["pa.DataType", An def _visit_struct_fields( self, struct_type: "pa.StructType", data: dict | None ) -> tuple["pa.StructType", dict]: - """ - Helper method to recursively process struct fields. - - This is the default behavior for regular (non-semantic) structs. - """ + """Recursively process struct fields. Default behavior for regular structs.""" if data is None: return struct_type, None @@ -97,7 +118,6 @@ def _visit_struct_fields( for field in struct_type: field_data = data.get(field.name) new_field_type, new_field_data = self.visit(field.type, field_data) - new_fields.append(pa.field(field.name, new_field_type)) new_data[field.name] = new_field_data @@ -106,11 +126,7 @@ def _visit_struct_fields( def _visit_list_elements( self, list_type: "pa.ListType", data: list | None ) -> tuple["pa.DataType", list]: - """ - Helper method to recursively process list elements. - - This is the default behavior for lists. - """ + """Recursively process list elements.""" if data is None: return list_type, None @@ -121,16 +137,12 @@ def _visit_list_elements( for item in data: current_element_type, processed_item = self.visit(element_type, item) processed_elements.append(processed_item) - - # Use the first non-None element to determine new element type if new_element_type is None: new_element_type = current_element_type - # If list was empty or all None, keep original element type if new_element_type is None: new_element_type = element_type - # Create appropriate list type based on original type if pa.types.is_large_list(list_type): return pa.large_list(new_element_type), processed_elements elif pa.types.is_fixed_size_list(list_type): @@ -140,77 +152,99 @@ def _visit_list_elements( class SemanticHashingError(Exception): - """Exception raised when semantic hashing fails""" - + """Exception raised when semantic hashing fails.""" pass class SemanticHashingVisitor(ArrowTypeDataVisitor): + """Visitor that replaces extension-typed columns with their content hashes. + + For each Arrow column whose type is a ``pa.ExtensionType``: + + 1. Look up the corresponding Python type via ``type_converter``. + 2. If the Python type has a semantic hasher registered in ``python_hasher``, + convert the storage value to a Python object and hash it, replacing the + column with a ``pa.large_binary()`` value of the form:: + + + b"::" + content_hash.to_prefixed_digest() + + where ``type_name`` is the extension name with dots replaced by colons + (e.g. ``"orcapod.path"`` → ``"orcapod:path"``), and + ``to_prefixed_digest()`` = ``method_bytes + b":" + digest``. + 3. If no hasher is registered (or the converter doesn't know the type), + return the extension type and storage value unchanged. The downstream + ``StarfixArrowHasher`` / ``ArrowDigester`` will see the full extension + metadata intact and hash it in a type-aware way. + + Args: + type_converter: The active ``UniversalTypeConverter`` for resolving + extension type → Python type and storage → Python conversion. + python_hasher: The active ``SemanticAwarePythonHasher`` for hashing + Python objects. """ - Visitor that replaces semantic types with their hash strings. - This visitor traverses Arrow type structures and data simultaneously, - identifying semantic types by their struct signatures and replacing - them with hash strings computed by their respective converters. - """ - - def __init__(self, semantic_registry: SemanticTypeRegistry): - """ - Initialize the semantic hashing visitor. - - Args: - semantic_registry: Registry containing semantic type converters - """ - self.registry = semantic_registry + def __init__( + self, + type_converter: "UniversalTypeConverter", + python_hasher: "SemanticAwarePythonHasher", + ) -> None: + self._type_converter = type_converter + self._python_hasher = python_hasher self._current_field_path: list[str] = [] + def visit_extension( + self, + extension_type: "pa.ExtensionType", + storage_value: Any, + ) -> tuple["pa.DataType", Any]: + """Hash an extension type value to pa.large_binary(), or passthrough.""" + if storage_value is None: + return extension_type, None + + from typing import Any as _Any + + # Resolve extension type → Python type. + python_type = self._type_converter.arrow_type_to_python_type(extension_type) + + # If the converter couldn't resolve to a concrete class, passthrough. + if python_type is _Any or not isinstance(python_type, type): + return extension_type, storage_value + + # Only hash if a semantic hasher is registered for this Python type. + if not self._python_hasher.type_semantic_hasher_registry.has_semantic_hasher( + python_type + ): + return extension_type, storage_value + + # Convert storage value → Python object and hash it. + python_obj = self._type_converter.storage_to_python(storage_value, python_type) + content_hash = self._python_hasher.hash_object(python_obj) + + # Encode as binary: ":::" + # Dots in the extension name → colons (e.g. "orcapod.path" → "orcapod:path"). + # The "::" separator is unambiguous because to_prefixed_digest() uses only ":". + type_name = extension_type.extension_name.replace(".", ":") + hash_bytes = ( + type_name.encode("ascii") + + b"::" + + content_hash.to_prefixed_digest() + ) + return pa.large_binary(), hash_bytes + def visit_struct( self, struct_type: "pa.StructType", data: dict | None ) -> tuple["pa.DataType", Any]: - """ - Visit a struct type, checking if it's a semantic type. - - If the struct is a semantic type (recognized by signature), replace it - with a hash string. Otherwise, recursively process its fields. - """ + """Regular struct (no extension identity) — recurse into fields.""" if data is None: return struct_type, None - - # Check if this struct IS a semantic type by signature recognition - converter = self.registry.get_converter_for_struct_signature(struct_type) - if converter: - # This is a semantic type - hash it - try: - hash_string = converter.hash_struct_dict(data) - return pa.large_string(), hash_string - except Exception as e: - field_path = ( - ".".join(self._current_field_path) - if self._current_field_path - else "" - ) - converter_name = getattr( - converter, "semantic_type_name", str(type(converter).__name__) - ) - raise SemanticHashingError( - f"Failed to hash semantic type '{converter_name}' at field path '{field_path}': {str(e)}" - ) from e - else: - # Regular struct - recursively process fields - return self._visit_struct_fields(struct_type, data) + return self._visit_struct_fields(struct_type, data) def visit_list( self, list_type: "pa.ListType", data: list | None ) -> tuple["pa.DataType", Any]: - """ - Visit a list type, recursively processing elements. - - Elements that are semantic types will be replaced with hash strings. - """ + """Recurse into list elements.""" if data is None: return list_type, None - - # Add list indicator to field path for error context self._current_field_path.append("[*]") try: return self._visit_list_elements(list_type, data) @@ -220,28 +254,19 @@ def visit_list( def visit_map( self, map_type: "pa.MapType", data: dict | None ) -> tuple["pa.DataType", Any]: - """ - Visit a map type. - - For now, we treat maps as pass-through since they're less common. - TODO: Implement proper map traversal if needed for semantic types in keys/values. - """ + """Pass map types through unchanged.""" return map_type, data def visit_primitive( self, primitive_type: "pa.DataType", data: Any ) -> tuple["pa.DataType", Any]: - """ - Visit a primitive type - pass through unchanged. - - Primitive types cannot be semantic types (which are always structs). - """ + """Pass primitive types through unchanged.""" return primitive_type, data def _visit_struct_fields( self, struct_type: "pa.StructType", data: dict | None ) -> tuple["pa.StructType", dict]: - """Override to add field path tracking for better error messages""" + """Override to add field path tracking for better error messages.""" if data is None: return struct_type, None @@ -249,12 +274,10 @@ def _visit_struct_fields( new_data = {} for field in struct_type: - # Add field name to path for error context self._current_field_path.append(field.name) try: field_data = data.get(field.name) new_field_type, new_field_data = self.visit(field.type, field_data) - new_fields.append(pa.field(field.name, new_field_type)) new_data[field.name] = new_field_data finally: diff --git a/tests/test_hashing/test_extension_type_hashing.py b/tests/test_hashing/test_extension_type_hashing.py new file mode 100644 index 00000000..8c45bf7f --- /dev/null +++ b/tests/test_hashing/test_extension_type_hashing.py @@ -0,0 +1,121 @@ +"""Tests for extension type column hashing via SemanticHashingVisitor.""" + +from __future__ import annotations + +import pyarrow as pa +import pytest +from pathlib import Path + +from orcapod.hashing.visitors import SemanticHashingVisitor +from orcapod.contexts import get_default_context + + +@pytest.fixture +def ctx(): + return get_default_context() + + +class TestArrowTypeDataVisitorExtension: + def test_visit_dispatches_to_visit_extension_for_extension_types(self, ctx): + """visit() routes ExtensionType columns to visit_extension(), not visit_struct().""" + arrow_type = ctx.type_converter.register_python_class(Path) + assert isinstance(arrow_type, pa.ExtensionType), ( + "Path must be registered as an Arrow extension type" + ) + + calls = [] + + class TrackingVisitor(SemanticHashingVisitor): + def visit_extension(self, ext_type, storage_value): + calls.append("visit_extension") + # Don't call super() here — just passthrough to avoid hashing a + # non-existent path. This test only verifies dispatch routing. + return ext_type, storage_value + + def visit_struct(self, struct_type, data): + calls.append("visit_struct") + return super().visit_struct(struct_type, data) + + visitor = TrackingVisitor(ctx.type_converter, ctx.semantic_hasher) + # Any value is fine for this dispatch test — use a dummy string (storage for Path is str) + visitor.visit(arrow_type, "/tmp/dummy") + assert "visit_extension" in calls + assert "visit_struct" not in calls + + +class TestSemanticHashingVisitorExtension: + def test_path_column_hashed_to_large_binary(self, ctx, tmp_path): + """Path extension columns are replaced with pa.large_binary() hash tokens.""" + file = tmp_path / "test.txt" + file.write_text("hello") + + arrow_type = ctx.type_converter.register_python_class(Path) + storage_val = ctx.type_converter.python_to_storage(Path(file), Path) + + visitor = SemanticHashingVisitor(ctx.type_converter, ctx.semantic_hasher) + new_type, new_data = visitor.visit(arrow_type, storage_val) + + assert new_type == pa.large_binary() + assert isinstance(new_data, bytes) + + def test_same_content_same_hash(self, ctx, tmp_path): + """Two paths pointing to files with identical content produce the same hash bytes.""" + file1 = tmp_path / "a.txt" + file2 = tmp_path / "b.txt" + file1.write_text("identical content") + file2.write_text("identical content") + + arrow_type = ctx.type_converter.register_python_class(Path) + storage1 = ctx.type_converter.python_to_storage(Path(file1), Path) + storage2 = ctx.type_converter.python_to_storage(Path(file2), Path) + + visitor = SemanticHashingVisitor(ctx.type_converter, ctx.semantic_hasher) + _, hash1 = visitor.visit(arrow_type, storage1) + _, hash2 = visitor.visit(arrow_type, storage2) + + assert hash1 == hash2 + + def test_different_content_different_hash(self, ctx, tmp_path): + """Files with different content produce different hash bytes.""" + file1 = tmp_path / "x.txt" + file2 = tmp_path / "y.txt" + file1.write_text("content A") + file2.write_text("content B") + + arrow_type = ctx.type_converter.register_python_class(Path) + storage1 = ctx.type_converter.python_to_storage(Path(file1), Path) + storage2 = ctx.type_converter.python_to_storage(Path(file2), Path) + + visitor = SemanticHashingVisitor(ctx.type_converter, ctx.semantic_hasher) + _, hash1 = visitor.visit(arrow_type, storage1) + _, hash2 = visitor.visit(arrow_type, storage2) + + assert hash1 != hash2 + + def test_binary_encoding_format(self, ctx, tmp_path): + """Hash bytes have format b':::'.""" + file = tmp_path / "test.txt" + file.write_text("test") + + arrow_type = ctx.type_converter.register_python_class(Path) + storage_val = ctx.type_converter.python_to_storage(Path(file), Path) + + visitor = SemanticHashingVisitor(ctx.type_converter, ctx.semantic_hasher) + _, hash_bytes = visitor.visit(arrow_type, storage_val) + + assert b"::" in hash_bytes + type_prefix, hash_part = hash_bytes.split(b"::", 1) + # Extension name "orcapod.path" → dots replaced with colons + assert type_prefix == b"orcapod:path" + # hash_part should be "method:digest" — at least one colon + assert b":" in hash_part + + def test_null_value_passthrough(self, ctx): + """Null storage values pass through as-is.""" + arrow_type = ctx.type_converter.register_python_class(Path) + + visitor = SemanticHashingVisitor(ctx.type_converter, ctx.semantic_hasher) + new_type, new_data = visitor.visit(arrow_type, None) + + assert new_type == arrow_type + assert new_data is None diff --git a/tests/test_hashing/test_starfix_arrow_hasher.py b/tests/test_hashing/test_starfix_arrow_hasher.py index 77e52f76..0c6ff67e 100644 --- a/tests/test_hashing/test_starfix_arrow_hasher.py +++ b/tests/test_hashing/test_starfix_arrow_hasher.py @@ -27,12 +27,12 @@ import pytest import pyarrow as pa +from orcapod.contexts import get_default_context from orcapod.hashing.arrow_hashers import StarfixArrowHasher from orcapod.hashing.versioned_hashers import ( _CURRENT_ARROW_HASHER_ID, get_versioned_semantic_arrow_hasher, ) -from orcapod.semantic_types import SemanticTypeRegistry from orcapod.types import ContentHash @@ -46,8 +46,9 @@ def _make_hasher() -> StarfixArrowHasher: + ctx = get_default_context() return StarfixArrowHasher( - semantic_registry=SemanticTypeRegistry(), + type_converter=ctx.type_converter, hasher_id=HASHER_ID, ) diff --git a/tests/test_semantic_types/test_semantic_registry.py b/tests/test_semantic_types/test_semantic_registry.py index 82df93e0..fd044ff5 100644 --- a/tests/test_semantic_types/test_semantic_registry.py +++ b/tests/test_semantic_types/test_semantic_registry.py @@ -132,39 +132,35 @@ def test_integration_with_converter(): assert retrieved is converter -def test_uuid_type_registered_in_default_registry(): - """uuid.UUID should be registered and map to pa.struct([pa.field('uuid', pa.binary(16))]).""" - from orcapod.hashing.versioned_hashers import get_versioned_semantic_arrow_hasher - - hasher = get_versioned_semantic_arrow_hasher() - registry = hasher.semantic_registry - converter = registry.get_converter_for_python_type(uuid.UUID) - assert converter is not None - assert converter.arrow_struct_type == pa.struct([pa.field("uuid", pa.binary(16))]) +def test_uuid_type_registered_in_default_context(): + """uuid.UUID should be registered as an Arrow extension type in the default context.""" + from orcapod.contexts import get_default_context + + ctx = get_default_context() + arrow_type = ctx.type_converter.register_python_class(uuid.UUID) + assert isinstance(arrow_type, pa.ExtensionType), ( + "uuid.UUID must be registered as an Arrow extension type" + ) -def test_uuid_struct_resolves_to_converter(): - """pa.struct([pa.field('uuid', pa.binary(16))]) should resolve back to a converter for uuid.UUID.""" - from orcapod.hashing.versioned_hashers import get_versioned_semantic_arrow_hasher +def test_uuid_extension_type_resolves_to_python_type(): + """The Arrow extension type for UUID should resolve back to uuid.UUID.""" + from orcapod.contexts import get_default_context - hasher = get_versioned_semantic_arrow_hasher() - registry = hasher.semantic_registry - converter = registry.get_converter_for_struct_signature( - pa.struct([pa.field("uuid", pa.binary(16))]) - ) - assert converter is not None - assert converter.python_type is uuid.UUID + ctx = get_default_context() + arrow_type = ctx.type_converter.register_python_class(uuid.UUID) + python_type = ctx.type_converter.arrow_type_to_python_type(arrow_type) + assert python_type is uuid.UUID -def test_uuid_semantic_type_name_registered(): - """Converter registered under the name 'uuid'.""" - from orcapod.hashing.versioned_hashers import get_versioned_semantic_arrow_hasher +def test_uuid_extension_name(): + """The UUID extension type should have the expected extension name.""" + from orcapod.contexts import get_default_context - hasher = get_versioned_semantic_arrow_hasher() - registry = hasher.semantic_registry - converter = registry.get_converter_for_semantic_type("uuid") - assert converter is not None - assert converter.python_type is uuid.UUID + ctx = get_default_context() + arrow_type = ctx.type_converter.register_python_class(uuid.UUID) + assert isinstance(arrow_type, pa.ExtensionType) + assert "uuid" in arrow_type.extension_name.lower() # Comprehensive unregister tests for future implementation From 6bab2f43fdf6624de6ad8bba6cdaaf1a7d007ec6 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 18:57:41 +0000 Subject: [PATCH 15/33] fix(visitors): use real file in dispatch test, remove deferred typing import - Fix test_visit_dispatches_to_visit_extension_for_extension_types to use a real file (via tmp_path fixture) and call super() in visit_extension to validate the full dispatch chain - Move deferred 'from typing import Any' to module-level import at top of visitors.py and use typing.Any in visit_extension method Co-Authored-By: Claude Sonnet 4.6 --- src/orcapod/hashing/visitors.py | 5 ++--- tests/test_hashing/test_extension_type_hashing.py | 14 ++++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/orcapod/hashing/visitors.py b/src/orcapod/hashing/visitors.py index b257f7f6..72015ebf 100644 --- a/src/orcapod/hashing/visitors.py +++ b/src/orcapod/hashing/visitors.py @@ -5,6 +5,7 @@ from __future__ import annotations from abc import ABC, abstractmethod +import typing from typing import TYPE_CHECKING, Any from orcapod.utils.lazy_module import LazyModule @@ -201,13 +202,11 @@ def visit_extension( if storage_value is None: return extension_type, None - from typing import Any as _Any - # Resolve extension type → Python type. python_type = self._type_converter.arrow_type_to_python_type(extension_type) # If the converter couldn't resolve to a concrete class, passthrough. - if python_type is _Any or not isinstance(python_type, type): + if python_type is typing.Any or not isinstance(python_type, type): return extension_type, storage_value # Only hash if a semantic hasher is registered for this Python type. diff --git a/tests/test_hashing/test_extension_type_hashing.py b/tests/test_hashing/test_extension_type_hashing.py index 8c45bf7f..f371ef9b 100644 --- a/tests/test_hashing/test_extension_type_hashing.py +++ b/tests/test_hashing/test_extension_type_hashing.py @@ -16,29 +16,31 @@ def ctx(): class TestArrowTypeDataVisitorExtension: - def test_visit_dispatches_to_visit_extension_for_extension_types(self, ctx): + def test_visit_dispatches_to_visit_extension_for_extension_types(self, ctx, tmp_path): """visit() routes ExtensionType columns to visit_extension(), not visit_struct().""" + # Create a real file so visit_extension can complete without errors + real_file = tmp_path / "dummy.txt" + real_file.write_text("dispatch test") + arrow_type = ctx.type_converter.register_python_class(Path) assert isinstance(arrow_type, pa.ExtensionType), ( "Path must be registered as an Arrow extension type" ) + storage_val = ctx.type_converter.python_to_storage(Path(real_file), Path) calls = [] class TrackingVisitor(SemanticHashingVisitor): def visit_extension(self, ext_type, storage_value): calls.append("visit_extension") - # Don't call super() here — just passthrough to avoid hashing a - # non-existent path. This test only verifies dispatch routing. - return ext_type, storage_value + return super().visit_extension(ext_type, storage_value) def visit_struct(self, struct_type, data): calls.append("visit_struct") return super().visit_struct(struct_type, data) visitor = TrackingVisitor(ctx.type_converter, ctx.semantic_hasher) - # Any value is fine for this dispatch test — use a dummy string (storage for Path is str) - visitor.visit(arrow_type, "/tmp/dummy") + visitor.visit(arrow_type, storage_val) assert "visit_extension" in calls assert "visit_struct" not in calls From aaa307017934bd926af782dffd871ca9db820528 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 19:01:48 +0000 Subject: [PATCH 16/33] refactor(arrow_hashers): delete SemanticArrowHasher, finalize StarfixArrowHasher constructor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Deleted SemanticArrowHasher class (old struct-based arrow hasher) - Renamed python_hasher parameter to semantic_hasher (required positional) - Removed lazy resolution logic (_get_python_hasher) — semantic_hasher is now required - Removed unused imports: arrow_serialization, arrow_utils, SemanticTypeRegistry Co-Authored-By: Claude Sonnet 4.6 --- src/orcapod/hashing/arrow_hashers.py | 361 +++------------------------ 1 file changed, 35 insertions(+), 326 deletions(-) diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index 3f306c7a..f568cac1 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -1,311 +1,70 @@ -import hashlib -import json -from collections.abc import Callable +from __future__ import annotations + from typing import TYPE_CHECKING, Any import pyarrow as pa from starfix import ArrowDigester -from orcapod.hashing import arrow_serialization from orcapod.hashing.schema_cleaner import clean_schema_for_hashing, has_extension_metadata from orcapod.hashing.visitors import SemanticHashingVisitor -from orcapod.semantic_types import SemanticTypeRegistry from orcapod.types import ContentHash -from orcapod.utils import arrow_utils if TYPE_CHECKING: from orcapod.semantic_types.universal_converter import UniversalTypeConverter from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher -SERIALIZATION_METHOD_LUT: dict[str, Callable[[pa.Table], bytes]] = { - "logical": arrow_serialization.serialize_table_logical, -} - - -def json_pyarrow_table_serialization(table: pa.Table) -> str: - """ - Serialize a PyArrow table to a stable JSON string by converting to dictionary of lists. - - Args: - table: PyArrow table to serialize - - Returns: - JSON string representation with sorted keys and no whitespace - """ - # Convert table to dictionary of lists using to_pylist() - data_dict = {} - - for column_name in table.column_names: - # Convert Arrow column to Python list, which visits all elements - data_dict[column_name] = table.column(column_name).to_pylist() - - # Serialize to JSON with sorted keys and no whitespace - return json.dumps( - data_dict, - separators=(",", ":"), - sort_keys=True, - ) - - -class SemanticArrowHasher: - """ - Stable hasher for Arrow tables with semantic type support. - - This hasher: - 1. Uses visitor pattern to recursively process nested data structures - 2. Replaces semantic types with their hash strings using registered converters - 3. Sorts columns by name for deterministic ordering - 4. Uses Arrow serialization for stable binary representation - 5. Computes final hash of the processed table - """ - - def __init__( - self, - semantic_registry: SemanticTypeRegistry, - hasher_id: str | None = None, - hash_algorithm: str = "sha256", - chunk_size: int = 8192, - handle_missing: str = "error", - serialization_method: str = "logical", - # TODO: consider passing options for serialization method - ): - """ - Initialize SemanticArrowHasher. - - Args: - semantic_registry: Registry containing semantic type converters with hashing - hash_algorithm: Hash algorithm to use for final table hash - chunk_size: Size of chunks to read files in bytes (legacy, may be removed) - hasher_id: Unique identifier for this hasher instance - handle_missing: How to handle missing files ('error', 'skip', 'null_hash') - serialization_method: Method for serializing Arrow table - """ - if hasher_id is None: - hasher_id = f"semantic_arrow_hasher:{hash_algorithm}:{serialization_method}" - - self._hasher_id = hasher_id - self.semantic_registry = semantic_registry - self.chunk_size = chunk_size - self.handle_missing = handle_missing - self.hash_algorithm = hash_algorithm - - if serialization_method not in SERIALIZATION_METHOD_LUT: - raise ValueError( - f"Invalid serialization method '{serialization_method}'. " - f"Supported methods: {list(SERIALIZATION_METHOD_LUT.keys())}" - ) - self.serialization_method = serialization_method - - @property - def hasher_id(self) -> str: - return self._hasher_id - - def _process_table_columns(self, table: pa.Table | pa.RecordBatch) -> pa.Table: - """Process table columns using the semantic registry to hash struct-typed semantic columns. - - Traverses each column and replaces recognised semantic struct types (detected by - struct signature via ``SemanticTypeRegistry``) with their content-hash strings. - """ - # TODO: Process in batchwise/chunk-wise fashion for memory efficiency - # Currently using to_pylist() for simplicity but this loads entire table into memory - - new_columns = [] - new_fields = [] - - for i, field in enumerate(table.schema): - column_data = table.column(i).to_pylist() - - try: - if pa.types.is_struct(field.type): - converter = self.semantic_registry.get_converter_for_struct_signature(field.type) - if converter is not None: - # Semantic struct — replace with hash strings - processed_data = [ - converter.hash_struct_dict(row) if row is not None else None - for row in column_data - ] - new_type = pa.large_string() - new_columns.append(pa.array(processed_data, type=new_type)) - new_fields.append(pa.field(field.name, new_type)) - continue - - # Not a semantic type — pass through unchanged - new_columns.append(table.column(i)) - new_fields.append(field) - - except Exception as e: - raise RuntimeError( - f"Failed to process column '{field.name}': {str(e)}" - ) from e - - # Return new table with processed columns - return pa.table(new_columns, schema=pa.schema(new_fields)) - - def _sort_table_columns(self, table: pa.Table) -> pa.Table: - """Sort table columns by field name for deterministic ordering.""" - # Get sorted column names - sorted_column_names = sorted(table.column_names) - - # Use select to reorder columns - much cleaner! - return table.select(sorted_column_names) - - def serialize_arrow_table(self, table: pa.Table) -> bytes: - """ - Serialize Arrow table using the configured serialization method. - - Args: - table: Arrow table to serialize - - Returns: - Serialized bytes of the table - """ - serialization_method_function = SERIALIZATION_METHOD_LUT[ - self.serialization_method - ] - return serialization_method_function(table) - - def hash_table(self, table: pa.Table | pa.RecordBatch) -> ContentHash: - """ - Compute stable hash of Arrow table with semantic type processing. - - Args: - table: Arrow table to hash - prefix_hasher_id: Whether to prefix hash with hasher ID - - Returns: - Hex string of the computed hash - """ - - # Step 1: Process columns with semantic types using visitor pattern - processed_table = self._process_table_columns(table) - - # Step 2: Sort columns by name for deterministic ordering - sorted_table = self._sort_table_columns(processed_table) - - # normalize all string to large strings (for compatibility with Polars) - normalized_table = arrow_utils.normalize_table_to_large_types(sorted_table) - - # Step 3: Serialize using configured serialization method - serialized_bytes = self.serialize_arrow_table(normalized_table) - - # Step 4: Compute final hash - hasher = hashlib.new(self.hash_algorithm) - hasher.update(serialized_bytes) - - return ContentHash(method=self.hasher_id, digest=hasher.digest()) - - def hash_table_with_metadata(self, table: pa.Table) -> dict[str, Any]: # noqa: C901 - """ - Compute hash with additional metadata about the process. - - Returns: - Dictionary containing hash, metadata, and processing info - """ - # Process table to see what transformations were made - processed_table = self._process_table_columns(table) - - # Track processing steps - processed_columns = [] - for i, (original_field, processed_field) in enumerate( - zip(table.schema, processed_table.schema) - ): - column_info = { - "name": original_field.name, - "original_type": str(original_field.type), - "processed_type": str(processed_field.type), - "was_processed": str(original_field.type) != str(processed_field.type), - } - processed_columns.append(column_info) - - # Compute hash - table_hash = self.hash_table(table) - - return { - "hash": table_hash, - "hasher_id": self.hasher_id, - "serialization_method": self.serialization_method, - "hash_algorithm": self.hash_algorithm, - "num_rows": len(table), - "num_columns": len(table.schema), - "processed_columns": processed_columns, - "column_order": [field.name for field in table.schema], - } - class StarfixArrowHasher: - """ - Arrow table hasher backed by the starfix-python ``ArrowDigester``. - - This hasher produces cross-language-compatible, deterministic content - addresses for Arrow tables and schemas by delegating to the canonical - StarFix specification (``starfix-python``). + """Arrow table hasher backed by the starfix-python ``ArrowDigester``. Pipeline -------- 1. **Semantic pre-processing** — the ``SemanticHashingVisitor`` traverses - every column and replaces recognised extension-typed columns (e.g. ``Path``) - with their content-addressed hash bytes. This step runs before the Arrow - bytes are ever touched by starfix, so the final hash captures *file content* - for path-typed columns rather than the raw path string. - 2. **Starfix hashing** — ``ArrowDigester.hash_table`` (or - ``ArrowDigester.hash_schema``) is called on the pre-processed table / - schema. The digester is column-order-independent and normalises - ``Utf8`` → ``LargeUtf8``, ``Binary`` → ``LargeBinary``, etc., - producing a 35-byte versioned SHA-256 digest that is byte-for-byte - identical to the Rust ``starfix`` crate output. + every column. Extension-typed columns whose Python type has a registered + semantic hasher are replaced with ``pa.large_binary()`` hash tokens + (e.g. ``Path`` columns are replaced by their file-content hash). + Extension-typed columns without a registered hasher pass through with + their full extension metadata intact. + 2. **Starfix hashing** — ``ArrowDigester.hash_table`` produces a 35-byte + versioned SHA-256 digest that is byte-for-byte identical to the Rust + ``starfix`` crate output. Parameters ---------- type_converter: - ``UniversalTypeConverter`` used by ``SemanticHashingVisitor`` to resolve - Arrow extension types to Python types and convert storage values. - python_hasher: - ``SemanticAwarePythonHasher`` used by ``SemanticHashingVisitor`` to hash - Python objects produced from extension-typed columns. + ``UniversalTypeConverter`` used to resolve extension types to Python + types and convert storage values back to Python objects. + semantic_hasher: + ``SemanticAwarePythonHasher`` used to hash Python objects extracted + from extension-typed columns. hasher_id: - String identifier embedded in every ``ContentHash`` produced by - this hasher. Bump this value whenever the hash algorithm changes - so that stored hashes remain distinguishable. + String identifier embedded in every ``ContentHash`` produced by this + hasher. """ def __init__( self, type_converter: "UniversalTypeConverter", + semantic_hasher: "SemanticAwarePythonHasher", hasher_id: str, - python_hasher: "SemanticAwarePythonHasher | None" = None, ) -> None: - self._hasher_id = hasher_id self._type_converter = type_converter - self._python_hasher = python_hasher + self._semantic_hasher = semantic_hasher + self._hasher_id = hasher_id @property def hasher_id(self) -> str: return self._hasher_id - def _get_python_hasher(self) -> "SemanticAwarePythonHasher": - """Return the python_hasher, lazily resolving from default context if not set. - - Lazy resolution breaks the circular dependency that would arise if ``arrow_hasher`` - were constructed before ``semantic_hasher`` in the context JSON spec (which is the - natural order since ``type_handler_registry`` references ``arrow_hasher`` for - ``ArrowTableSemanticHasher``). - """ - if self._python_hasher is not None: - return self._python_hasher - from orcapod.contexts import get_default_context - return get_default_context().semantic_hasher # type: ignore[return-value] - - def _process_table_columns(self, table: pa.Table | pa.RecordBatch) -> pa.Table: - """Replace extension-typed columns with their content-hash bytes.""" + def _process_table_columns(self, table: "pa.Table | pa.RecordBatch") -> "pa.Table": + """Replace semantic-typed columns with their content-hash bytes.""" new_columns: list[pa.Array] = [] new_fields: list[pa.Field] = [] - python_hasher = self._get_python_hasher() - for i, field in enumerate(table.schema): - # Short-circuit: primitive columns (non-extension, non-struct, non-list, non-map) - # cannot contain extension semantic types, so skip the costly Python round-trip - # and reuse the original Arrow array directly. + # Short-circuit: columns that cannot contain semantic types skip + # the costly Python round-trip. Extension types must pass through + # so visit_extension can process them. if not ( isinstance(field.type, pa.ExtensionType) or pa.types.is_struct(field.type) @@ -319,28 +78,20 @@ def _process_table_columns(self, table: pa.Table | pa.RecordBatch) -> pa.Table: continue column_data = table.column(i).to_pylist() - visitor = SemanticHashingVisitor(self._type_converter, python_hasher) + visitor = SemanticHashingVisitor(self._type_converter, self._semantic_hasher) try: new_type: pa.DataType | None = None processed_data: list[Any] = [] for value in column_data: processed_type, processed_value = visitor.visit(field.type, value) - # Infer the output type from the first non-null processed value. - # When the first row is null, visit_struct returns the original - # struct type rather than the converted type (e.g. large_string), - # which would cause pa.array() to fail for subsequent non-null rows. if new_type is None and processed_value is not None: new_type = processed_type processed_data.append(processed_value) - # For empty or all-null columns there are no non-null values to infer - # the type from; fall back to the field's declared type. if new_type is None: new_type = field.type new_columns.append(pa.array(processed_data, type=new_type)) - # Preserve original field attributes (nullable, metadata) while - # updating only the type, so the schema fed to starfix remains faithful. new_fields.append(field.with_type(new_type)) except Exception as exc: @@ -348,61 +99,21 @@ def _process_table_columns(self, table: pa.Table | pa.RecordBatch) -> pa.Table: f"Failed to process column '{field.name}': {exc}" ) from exc - # Preserve the original schema-level metadata while using updated fields. - return pa.table(new_columns, schema=pa.schema(new_fields, metadata=table.schema.metadata)) - - def hash_schema(self, schema: pa.Schema) -> ContentHash: - """Hash an Arrow schema using the starfix canonical algorithm. - - ``has_extension_metadata`` is checked first on the raw schema. When - no extension metadata is found, ``include_metadata=False`` is passed - to ``ArrowDigester`` directly without rebuilding the schema (starfix - ignores metadata when ``include_metadata=False``, so the hash is - identical). When extension metadata is present, ``clean_schema_for_hashing`` - strips non-``ARROW:extension:*`` keys before hashing with - ``include_metadata=True``, preserving byte-for-byte hash stability - with pre-v0.3.0 output for extension-free schemas. + return pa.table( + new_columns, + schema=pa.schema(new_fields, metadata=table.schema.metadata), + ) - Parameters - ---------- - schema: - The ``pa.Schema`` to hash. - - Returns - ------- - ContentHash - A ``ContentHash`` whose ``digest`` is the 35-byte versioned - SHA-256 produced by ``ArrowDigester.hash_schema``. - """ + def hash_schema(self, schema: "pa.Schema") -> ContentHash: + """Hash an Arrow schema using the starfix canonical algorithm.""" include_meta = has_extension_metadata(schema) if include_meta: schema = clean_schema_for_hashing(schema) digest = ArrowDigester.hash_schema(schema, include_metadata=include_meta) return ContentHash(method=self._hasher_id, digest=digest) - def hash_table(self, table: pa.Table | pa.RecordBatch) -> ContentHash: - """Hash an Arrow table (or ``RecordBatch``) using starfix. - - Semantic types are resolved to their content-hash strings first. - ``has_extension_metadata`` is then checked on the processed table's - schema. When no extension metadata is found, the processed table is - passed to ``ArrowDigester.hash_table`` directly with - ``include_metadata=False``, avoiding a schema rebuild and new table - allocation. When extension metadata is present, - ``clean_schema_for_hashing`` strips non-``ARROW:extension:*`` keys - before hashing with ``include_metadata=True``. - - Parameters - ---------- - table: - The ``pa.Table`` or ``pa.RecordBatch`` to hash. - - Returns - ------- - ContentHash - A ``ContentHash`` whose ``digest`` is the 35-byte versioned - SHA-256 produced by ``ArrowDigester.hash_table``. - """ + def hash_table(self, table: "pa.Table | pa.RecordBatch") -> ContentHash: + """Hash an Arrow table (or ``RecordBatch``) using starfix.""" if isinstance(table, pa.RecordBatch): table = pa.Table.from_batches([table]) @@ -410,8 +121,6 @@ def hash_table(self, table: pa.Table | pa.RecordBatch) -> ContentHash: include_meta = has_extension_metadata(processed_table.schema) if include_meta: clean_schema = clean_schema_for_hashing(processed_table.schema) - # clean_schema_for_hashing only strips metadata; physical types and - # column order are unchanged, so from_arrays is safe without a cast. clean_table = pa.Table.from_arrays( processed_table.columns, schema=clean_schema ) From ba3d977c9b16a61a2fe4474adbc2726e9cf13acd Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 19:01:52 +0000 Subject: [PATCH 17/33] test(starfix_arrow_hasher): update _make_hasher() for new constructor - Added semantic_hasher=ctx.semantic_hasher to _make_hasher() - Moved get_default_context import inside _make_hasher() (no top-level import needed) Co-Authored-By: Claude Sonnet 4.6 --- tests/test_hashing/test_starfix_arrow_hasher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_hashing/test_starfix_arrow_hasher.py b/tests/test_hashing/test_starfix_arrow_hasher.py index 0c6ff67e..4734e436 100644 --- a/tests/test_hashing/test_starfix_arrow_hasher.py +++ b/tests/test_hashing/test_starfix_arrow_hasher.py @@ -27,7 +27,6 @@ import pytest import pyarrow as pa -from orcapod.contexts import get_default_context from orcapod.hashing.arrow_hashers import StarfixArrowHasher from orcapod.hashing.versioned_hashers import ( _CURRENT_ARROW_HASHER_ID, @@ -46,9 +45,11 @@ def _make_hasher() -> StarfixArrowHasher: + from orcapod.contexts import get_default_context ctx = get_default_context() return StarfixArrowHasher( type_converter=ctx.type_converter, + semantic_hasher=ctx.semantic_hasher, hasher_id=HASHER_ID, ) From 4cf7001abf2da34b9fd9c5bb899343bcb207a0f2 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 19:11:04 +0000 Subject: [PATCH 18/33] feat(v0.1): wire extension type hashing into default context; remove semantic_registry - Rewrote v0.1.json: removed semantic_registry and type_handler_registry keys - Added python_type_semantic_hasher_registry key with all type handlers - arrow_hasher now wires in both type_converter and semantic_hasher refs - pa.Table/pa.RecordBatch handlers added back using lazy arrow_hasher resolution to break the circular dep (ArrowTableSemanticHasher now accepts optional arg) - context_schema.json: removed semantic_registry property, renamed type_handler_registry -> python_type_semantic_hasher_registry - versioned_hashers.py: get_versioned_semantic_arrow_hasher() now sources both type_converter and semantic_hasher from default context via resolve_context() Co-Authored-By: Claude Sonnet 4.6 --- .../contexts/data/schemas/context_schema.json | 8 +- src/orcapod/contexts/data/v0.1.json | 77 +++++++------------ .../semantic_hashing/builtin_handlers.py | 17 +++- src/orcapod/hashing/versioned_hashers.py | 24 ++---- 4 files changed, 51 insertions(+), 75 deletions(-) diff --git a/src/orcapod/contexts/data/schemas/context_schema.json b/src/orcapod/contexts/data/schemas/context_schema.json index 1a6ac840..0465b47d 100644 --- a/src/orcapod/contexts/data/schemas/context_schema.json +++ b/src/orcapod/contexts/data/schemas/context_schema.json @@ -41,10 +41,6 @@ "Enhanced version with timestamp support and improved hashing" ] }, - "semantic_registry": { - "$ref": "#/$defs/objectspec", - "description": "ObjectSpec for the semantic registry" - }, "type_converter": { "$ref": "#/$defs/objectspec", "description": "ObjectSpec for the python-arrow type converter" @@ -57,9 +53,9 @@ "$ref": "#/$defs/objectspec", "description": "ObjectSpec for the semantic hasher component" }, - "type_handler_registry": { + "python_type_semantic_hasher_registry": { "$ref": "#/$defs/objectspec", - "description": "ObjectSpec for the TypeHandlerRegistry used by the semantic hasher" + "description": "ObjectSpec for the PythonTypeSemanticHasherRegistry used by the semantic hasher" }, "file_hasher": { "$ref": "#/$defs/objectspec", diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index a25d6e60..447db766 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -1,32 +1,13 @@ { "context_key": "std:v0.1:default", "version": "v0.1", - "description": "Initial stable release with basic Path semantic type support", + "description": "Initial stable release with extension type hashing support", "file_hasher": { "_class": "orcapod.hashing.file_hashers.BasicFileHasher", "_config": { "algorithm": "sha256" } }, - "semantic_registry": { - "_class": "orcapod.semantic_types.semantic_registry.SemanticTypeRegistry", - "_config": { - "converters": { - "upath": { - "_class": "orcapod.semantic_types.semantic_struct_converters.UPathStructConverter", - "_config": { - "file_hasher": {"_ref": "file_hasher"} - } - }, - "path": { - "_class": "orcapod.semantic_types.semantic_struct_converters.PythonPathStructConverter", - "_config": { - "file_hasher": {"_ref": "file_hasher"} - } - } - } - } - }, "type_converter": { "_class": "orcapod.semantic_types.universal_converter.UniversalTypeConverter", "_config": { @@ -76,34 +57,25 @@ "include_defaults": true } }, - "arrow_hasher": { - "_class": "orcapod.hashing.arrow_hashers.StarfixArrowHasher", - "_config": { - "hasher_id": "arrow_v0.1", - "type_converter": { - "_ref": "type_converter" - } - } - }, - "type_handler_registry": { + "python_type_semantic_hasher_registry": { "_class": "orcapod.hashing.semantic_hashing.type_handler_registry.PythonTypeSemanticHasherRegistry", "_config": { "handlers": [ - [{"_type": "builtins.bytes"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesSemanticHasher", "_config": {}}], - [{"_type": "builtins.bytearray"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesSemanticHasher", "_config": {}}], - [{"_type": "pathlib.Path"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.PathSemanticHasher", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], - [{"_type": "upath.core.UPath"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UPathSemanticHasher", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], - [{"_type": "uuid.UUID"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UUIDSemanticHasher", "_config": {}}], - [{"_type": "types.FunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionSemanticHasher", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], - [{"_type": "types.BuiltinFunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionSemanticHasher", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], - [{"_type": "types.MethodType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionSemanticHasher", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], - [{"_type": "builtins.type"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.TypeObjectSemanticHasher", "_config": {}}], - [{"_type": "types.GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasSemanticHasher", "_config": {}}], - [{"_type": "types.UnionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UnionTypeSemanticHasher", "_config": {}}], - [{"_type": "typing._GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasSemanticHasher", "_config": {}}], - [{"_type": "typing._SpecialForm"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.SpecialFormSemanticHasher", "_config": {}}], - [{"_type": "pyarrow.Table"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableSemanticHasher", "_config": {"arrow_hasher": {"_ref": "arrow_hasher"}}}], - [{"_type": "pyarrow.RecordBatch"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableSemanticHasher", "_config": {"arrow_hasher": {"_ref": "arrow_hasher"}}}] + [{"_type": "builtins.bytes"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesSemanticHasher", "_config": {}}], + [{"_type": "builtins.bytearray"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesSemanticHasher", "_config": {}}], + [{"_type": "pathlib.Path"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.PathSemanticHasher", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], + [{"_type": "upath.core.UPath"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UPathSemanticHasher", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], + [{"_type": "uuid.UUID"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UUIDSemanticHasher", "_config": {}}], + [{"_type": "types.FunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionSemanticHasher", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], + [{"_type": "types.BuiltinFunctionType"},{"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionSemanticHasher", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], + [{"_type": "types.MethodType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionSemanticHasher", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], + [{"_type": "builtins.type"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.TypeObjectSemanticHasher", "_config": {}}], + [{"_type": "types.GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasSemanticHasher", "_config": {}}], + [{"_type": "types.UnionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UnionTypeSemanticHasher", "_config": {}}], + [{"_type": "typing._GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasSemanticHasher", "_config": {}}], + [{"_type": "typing._SpecialForm"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.SpecialFormSemanticHasher", "_config": {}}], + [{"_type": "pyarrow.Table"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableSemanticHasher", "_config": {}}], + [{"_type": "pyarrow.RecordBatch"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableSemanticHasher", "_config": {}}] ] } }, @@ -112,18 +84,27 @@ "_config": { "hasher_id": "semantic_v0.1", "type_semantic_hasher_registry": { - "_ref": "type_handler_registry" + "_ref": "python_type_semantic_hasher_registry" } } }, + "arrow_hasher": { + "_class": "orcapod.hashing.arrow_hashers.StarfixArrowHasher", + "_config": { + "hasher_id": "arrow_v0.1", + "type_converter": {"_ref": "type_converter"}, + "semantic_hasher": {"_ref": "semantic_hasher"} + } + }, "metadata": { - "created_date": "2025-08-01", + "created_date": "2026-06-24", "author": "OrcaPod Core Team", "changelog": [ "Initial release with Path semantic type support", "Basic SHA-256 hashing for files and objects", "Arrow logical serialization method", - "Introduced arrow_v0.1 StarfixArrowHasher using starfix ArrowDigester for cross-language-compatible Arrow hashing" + "Introduced arrow_v0.1 StarfixArrowHasher using starfix ArrowDigester for cross-language-compatible Arrow hashing", + "Hard cut: replaced shape-based SemanticTypeRegistry with extension-type hashing; renamed all hashing classes to cleaner names" ] } } diff --git a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py index 48e7dc12..fd5cef22 100644 --- a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py +++ b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py @@ -191,11 +191,20 @@ class ArrowTableSemanticHasher: """Hasher for ``pa.Table`` and ``pa.RecordBatch`` objects. Args: - arrow_hasher: Any object satisfying ``ArrowHasherProtocol``. + arrow_hasher: Any object satisfying ``ArrowHasherProtocol``. When + ``None``, the default data context's ``arrow_hasher`` is resolved + lazily at call time (breaking the circular dependency that would + arise if the registry were constructed before the arrow hasher). """ - def __init__(self, arrow_hasher: "ArrowHasherProtocol") -> None: - self.arrow_hasher = arrow_hasher + def __init__(self, arrow_hasher: "ArrowHasherProtocol | None" = None) -> None: + self._arrow_hasher = arrow_hasher + + def _get_arrow_hasher(self) -> "ArrowHasherProtocol": + if self._arrow_hasher is not None: + return self._arrow_hasher + from orcapod.contexts import get_default_context + return get_default_context().arrow_hasher # type: ignore[return-value] def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: import pyarrow as _pa @@ -206,7 +215,7 @@ def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: raise TypeError( f"ArrowTableSemanticHasher: expected pa.Table or pa.RecordBatch, got {type(obj)!r}" ) - return self.arrow_hasher.hash_table(obj) + return self._get_arrow_hasher().hash_table(obj) class SchemaSemanticHasher: diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index 080cbec6..784d3617 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -93,27 +93,16 @@ def get_versioned_semantic_hasher( def get_versioned_semantic_arrow_hasher( hasher_id: str = _CURRENT_ARROW_HASHER_ID, ) -> hp.ArrowHasherProtocol: - """ - Return a StarfixArrowHasher configured for the current version. - - The arrow hasher handles Arrow table / RecordBatch hashing with - extension-type awareness (e.g. Path columns are hashed by file content). - - Parameters - ---------- - hasher_id: - Identifier embedded in every ContentHash produced by this hasher. + """Return a StarfixArrowHasher configured for the current version. - Returns - ------- - ArrowHasherProtocol - A fully configured StarfixArrowHasher instance. + Sources ``type_converter`` and ``semantic_hasher`` from the default + ``DataContext`` so that the arrow hasher is consistent with all other + versioned components. """ - from orcapod.contexts import get_default_context from orcapod.hashing.arrow_hashers import StarfixArrowHasher + from orcapod.contexts import resolve_context - ctx = get_default_context() - + ctx = resolve_context(None) # default context logger.debug( "get_versioned_semantic_arrow_hasher: creating StarfixArrowHasher " "(hasher_id=%r)", @@ -122,4 +111,5 @@ def get_versioned_semantic_arrow_hasher( return StarfixArrowHasher( hasher_id=hasher_id, type_converter=ctx.type_converter, + semantic_hasher=ctx.semantic_hasher, ) From f72832ab75a49b263b58adfd3e666bbf341dbd44 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 19:16:57 +0000 Subject: [PATCH 19/33] =?UTF-8?q?feat(PLT-1660):=20hard=20cut=20=E2=80=94?= =?UTF-8?q?=20delete=20SemanticTypeRegistry=20and=20old=20struct-based=20h?= =?UTF-8?q?ashing=20system?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Deleted src/orcapod/semantic_types/semantic_registry.py - Deleted src/orcapod/semantic_types/semantic_struct_converters.py - Removed SemanticTypeRegistry export from semantic_types/__init__.py - Removed SemanticStructConverterProtocol from protocols/semantic_types_protocols.py - Deleted tests/test_hashing/test_file_hashing_consistency.py (used SemanticArrowHasher) - Deleted tests/test_semantic_types/ directory (tested deleted classes) - Updated docstrings/comments to remove old class name references - ArrowTableSemanticHasher: made arrow_hasher optional with lazy context resolution to break the circular dep (registry -> ArrowTableSemanticHasher -> arrow_hasher -> registry) - context_schema.json: updated descriptions and examples to use new class names Co-Authored-By: Claude Sonnet 4.6 --- .../contexts/data/schemas/context_schema.json | 49 +- src/orcapod/core/datagrams/datagram.py | 4 +- src/orcapod/extension_types/registry.py | 4 +- src/orcapod/hashing/defaults.py | 2 +- src/orcapod/hashing/versioned_hashers.py | 6 +- .../protocols/semantic_types_protocols.py | 50 - src/orcapod/semantic_types/__init__.py | 2 - .../semantic_types/semantic_registry.py | 246 ---- .../semantic_struct_converters.py | 333 ------ .../test_file_hashing_consistency.py | 219 ---- .../test_path_struct_converter.py | 132 --- .../test_semantic_types/test_pydata_utils.py | 136 --- .../test_schema_arrow_equality.py | 324 ------ .../test_semantic_registry.py | 235 ---- .../test_semantic_struct_converters.py | 107 -- .../test_universal_converter.py | 1029 ----------------- .../test_upath_struct_converter.py | 148 --- .../test_uuid_struct_converter.py | 134 --- 18 files changed, 23 insertions(+), 3137 deletions(-) delete mode 100644 src/orcapod/semantic_types/semantic_registry.py delete mode 100644 src/orcapod/semantic_types/semantic_struct_converters.py delete mode 100644 tests/test_hashing/test_file_hashing_consistency.py delete mode 100644 tests/test_semantic_types/test_path_struct_converter.py delete mode 100644 tests/test_semantic_types/test_pydata_utils.py delete mode 100644 tests/test_semantic_types/test_schema_arrow_equality.py delete mode 100644 tests/test_semantic_types/test_semantic_registry.py delete mode 100644 tests/test_semantic_types/test_semantic_struct_converters.py delete mode 100644 tests/test_semantic_types/test_universal_converter.py delete mode 100644 tests/test_semantic_types/test_upath_struct_converter.py delete mode 100644 tests/test_semantic_types/test_uuid_struct_converter.py diff --git a/src/orcapod/contexts/data/schemas/context_schema.json b/src/orcapod/contexts/data/schemas/context_schema.json index 0465b47d..1a908dfc 100644 --- a/src/orcapod/contexts/data/schemas/context_schema.json +++ b/src/orcapod/contexts/data/schemas/context_schema.json @@ -59,11 +59,11 @@ }, "file_hasher": { "$ref": "#/$defs/objectspec", - "description": "ObjectSpec for the file content hasher (used by PathContentHandler)" + "description": "ObjectSpec for the file content hasher (used by PathSemanticHasher)" }, "function_info_extractor": { "$ref": "#/$defs/objectspec", - "description": "ObjectSpec for the function info extractor (used by FunctionHandler)" + "description": "ObjectSpec for the function info extractor (used by FunctionSemanticHasher)" }, "metadata": { "type": "object", @@ -163,51 +163,32 @@ { "context_key": "std:v0.1:default", "version": "v0.1", - "description": "Initial stable release with basic Path semantic type support", - "semantic_type_registry": { - "_class": "orcapod.types.semantic_types.SemanticTypeRegistry", - "_config": { - "converters": [ - { - "_class": "orcapod.types.semantic_types.PythonPathStructConverter", - "_config": {} - } - ] - } + "description": "Initial stable release with extension type hashing support", + "type_converter": { + "_class": "orcapod.semantic_types.universal_converter.UniversalTypeConverter", + "_config": {} }, "arrow_hasher": { - "_class": "orcapod.hashing.arrow_hashers.SemanticArrowHasher", + "_class": "orcapod.hashing.arrow_hashers.StarfixArrowHasher", "_config": { "hasher_id": "arrow_v0.1", - "hash_algorithm": "sha256", - "serialization_method": "logical", - "semantic_type_hashers": { - "path": { - "_class": "orcapod.hashing.semantic_type_hashers.PathHasher", - "_config": { - "file_hasher": { - "_class": "orcapod.hashing.file_hashers.BasicFileHasher", - "_config": { - "algorithm": "sha256" - } - } - } - } - } + "type_converter": {"_ref": "type_converter"}, + "semantic_hasher": {"_ref": "semantic_hasher"} } }, "semantic_hasher": { - "_class": "orcapod.hashing.semantic_hashing.semantic_hasher.BaseSemanticHasher", + "_class": "orcapod.hashing.semantic_hashing.semantic_hasher.SemanticAwarePythonHasher", "_config": { - "hasher_id": "semantic_v0.1" + "hasher_id": "semantic_v0.1", + "type_semantic_hasher_registry": {"_ref": "python_type_semantic_hasher_registry"} } }, "metadata": { - "created_date": "2025-08-01", + "created_date": "2026-06-24", "author": "OrcaPod Team", "changelog": [ - "Initial release with semantic type registry", - "Basic Arrow and object hashing capabilities" + "Initial release with extension type hashing support", + "StarfixArrowHasher for cross-language-compatible Arrow hashing" ] } } diff --git a/src/orcapod/core/datagrams/datagram.py b/src/orcapod/core/datagrams/datagram.py index 8fa2b48b..5ebae203 100644 --- a/src/orcapod/core/datagrams/datagram.py +++ b/src/orcapod/core/datagrams/datagram.py @@ -12,7 +12,7 @@ - **Dict for value access**: ``__getitem__``, ``get``, ``as_dict()`` always operate through the Python dict (loaded lazily from Arrow when needed). - **Arrow for hashing**: ``content_hash()`` always uses the Arrow table (loaded lazily from - dict when needed) via the data context's ``ArrowTableHandler``. + dict when needed) via the data context's ``ArrowTableSemanticHasher``. - **Meta is always dict**: meta columns are stored as a Python dict regardless of how the primary data was provided; the Arrow meta table is built lazily. """ @@ -418,7 +418,7 @@ def arrow_schema( def identity_structure(self) -> Any: """Return the primary data table as this datagram's identity. - The semantic hasher dispatches ``pa.Table`` to ``ArrowTableHandler``, + The semantic hasher dispatches ``pa.Table`` to ``ArrowTableSemanticHasher``, which delegates to the data context's ``arrow_hasher``. This means ``content_hash()`` (inherited from ``ContentIdentifiableBase``) produces a stable, content-addressed hash of the data columns without any diff --git a/src/orcapod/extension_types/registry.py b/src/orcapod/extension_types/registry.py index 8711b59b..32090242 100644 --- a/src/orcapod/extension_types/registry.py +++ b/src/orcapod/extension_types/registry.py @@ -190,8 +190,8 @@ class LogicalTypeRegistry: An optional ``logical_types`` list can be passed at construction time to pre-register one or more ``LogicalTypeProtocol`` instances immediately, following - the same pattern as ``SemanticTypeRegistry``'s ``converters`` constructor - argument. + the same pattern as the ``logical_types`` constructor argument used by + other registries in this package. An optional ``factories`` list can also be passed to pre-register ``LogicalTypeFactoryProtocol`` instances at construction time. Each entry is a diff --git a/src/orcapod/hashing/defaults.py b/src/orcapod/hashing/defaults.py index 0082c453..0dc8b6c2 100644 --- a/src/orcapod/hashing/defaults.py +++ b/src/orcapod/hashing/defaults.py @@ -87,7 +87,7 @@ def get_default_arrow_hasher( else: string_cacher = cache_file_hash - # set_cacher is present on SemanticArrowHasher but not on the + # set_cacher is present on StarfixArrowHasher but not on the # ArrowHasherProtocol protocol, so we call it via Any to avoid a type error. arrow_hasher.set_cacher("path", string_cacher) diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index 784d3617..a7fed13f 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -14,8 +14,8 @@ recursive hasher that replaces BasicObjectHasher). get_versioned_semantic_arrow_hasher() - Return the current-version SemanticArrowHasher (Arrow table hasher - with semantic-type support). + Return the current-version StarfixArrowHasher (Arrow table hasher + with extension-type semantic support). """ from __future__ import annotations @@ -86,7 +86,7 @@ def get_versioned_semantic_hasher( # --------------------------------------------------------------------------- -# SemanticArrowHasher factory +# StarfixArrowHasher factory # --------------------------------------------------------------------------- diff --git a/src/orcapod/protocols/semantic_types_protocols.py b/src/orcapod/protocols/semantic_types_protocols.py index 1f0a6b05..f2303190 100644 --- a/src/orcapod/protocols/semantic_types_protocols.py +++ b/src/orcapod/protocols/semantic_types_protocols.py @@ -54,53 +54,3 @@ def get_arrow_to_python_converter( def ensure_types_registered_for_schemas(self, *schemas: Schema) -> None: ... -# Core protocols -class SemanticStructConverterProtocol(Protocol): - """Protocol for converting between Python objects and semantic structs.""" - - @property - def python_type(self) -> DataType: - """The Python type this converter can handle.""" - ... - - @property - def arrow_struct_type(self) -> "pa.StructType": - """The Arrow struct type this converter produces.""" - ... - - def python_to_struct_dict(self, value: Any) -> dict[str, Any]: - """Convert Python value to struct dictionary.""" - ... - - def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Any: - """Convert struct dictionary back to Python value.""" - ... - - def can_handle_python_type(self, python_type: DataType) -> bool: - """Check if this converter can handle the given Python type.""" - ... - - def can_handle_struct_type(self, struct_type: "pa.StructType") -> bool: - """Check if this converter can handle the given struct type.""" - ... - - def hash_struct_dict(self, struct_dict: dict[str, Any]) -> str: - """ - Compute hash of the semantic type from its struct dictionary representation. - - Args: - struct_dict: Arrow struct dictionary representation - - Returns: - Hash string of the form ``"{type}:sha256:"``, - e.g. ``"path:sha256:abc123"`` - - Raises: - Exception: If hashing fails (e.g., file not found for path types) - """ - ... - - @property - def hasher_id(self) -> str: - """Identifier for this hasher (for debugging/versioning)""" - ... diff --git a/src/orcapod/semantic_types/__init__.py b/src/orcapod/semantic_types/__init__.py index 123777f5..f7948ee7 100644 --- a/src/orcapod/semantic_types/__init__.py +++ b/src/orcapod/semantic_types/__init__.py @@ -1,9 +1,7 @@ -from .semantic_registry import SemanticTypeRegistry from .universal_converter import UniversalTypeConverter from .type_inference import infer_python_schema_from_pylist_data __all__ = [ - "SemanticTypeRegistry", "UniversalTypeConverter", "infer_python_schema_from_pylist_data", ] diff --git a/src/orcapod/semantic_types/semantic_registry.py b/src/orcapod/semantic_types/semantic_registry.py deleted file mode 100644 index ff8c1a49..00000000 --- a/src/orcapod/semantic_types/semantic_registry.py +++ /dev/null @@ -1,246 +0,0 @@ -from __future__ import annotations - -from collections.abc import Mapping -from typing import TYPE_CHECKING, Any - -from orcapod.protocols.semantic_types_protocols import SemanticStructConverterProtocol -from orcapod.semantic_types import pydata_utils - -# from orcapod.semantic_types.type_inference import infer_python_schema_from_pylist_data -from orcapod.types import DataType, Schema -from orcapod.utils.lazy_module import LazyModule - -if TYPE_CHECKING: - import pyarrow as pa -else: - pa = LazyModule("pyarrow") - - -class SemanticTypeRegistry: - """ - Registry that manages semantic type converters using struct signature recognition. - - This registry maps Python types to PyArrow struct signatures, enabling - automatic detection and conversion of semantic types based on their - struct schema alone. - """ - - @staticmethod - def infer_python_schema_from_pylist(data: list[dict[str, Any]]) -> Schema: - """ - Infer Python schema from a list of dictionaries (pylist) - """ - return pydata_utils.infer_python_schema_from_pylist_data(data) - - @staticmethod - def infer_python_schema_from_pydict(data: dict[str, list[Any]]) -> Schema: - # TODO: consider which data type is more efficient and use that pylist or pydict - return pydata_utils.infer_python_schema_from_pylist_data( - pydata_utils.pydict_to_pylist(data) - ) - - def __init__( - self, converters: Mapping[str, SemanticStructConverterProtocol] | None = None - ): - # Bidirectional mappings between Python types and struct signatures - self._python_to_struct: dict[DataType, "pa.StructType"] = {} - self._struct_to_python: dict["pa.StructType", DataType] = {} - self._struct_to_converter: dict[ - "pa.StructType", SemanticStructConverterProtocol - ] = {} - - # Name mapping for convenience - self._name_to_converter: dict[str, SemanticStructConverterProtocol] = {} - self._struct_to_name: dict["pa.StructType", str] = {} - - # If initialized with a list of converters, register them - if converters: - for semantic_type_name, converter in converters.items(): - self.register_converter(semantic_type_name, converter) - - def register_converter( - self, semantic_type_name: str, converter: SemanticStructConverterProtocol - ) -> None: - """ - Register a semantic type converter. - - This creates bidirectional mappings between: - - Python type ↔ Arrow struct signature - - Arrow struct signature ↔ converter instance - - Optionally, a semantic type name can be provided. - """ - python_type = converter.python_type - struct_signature = converter.arrow_struct_type - - # Check for conflicts - if python_type in self._python_to_struct: - existing_struct = self._python_to_struct[python_type] - if existing_struct != struct_signature: - raise ValueError( - f"Python type {python_type} already registered with different struct signature. " - f"Existing: {existing_struct}, New: {struct_signature}" - ) - - if struct_signature in self._struct_to_python: - existing_python = self._struct_to_python[struct_signature] - if existing_python != python_type: - raise ValueError( - f"Struct signature {struct_signature} already registered with different Python type. " - f"Existing: {existing_python}, New: {python_type}" - ) - - # catch case where a different converter is already registered with the semantic type name - if existing_converter := self.get_converter_for_semantic_type( - semantic_type_name - ): - if existing_converter != converter: - raise ValueError( - f"Semantic type name '{semantic_type_name}' is already registered to {existing_converter}" - ) - - # Register bidirectional mappings - self._python_to_struct[python_type] = struct_signature - self._struct_to_python[struct_signature] = python_type - self._struct_to_converter[struct_signature] = converter - - self._name_to_converter[semantic_type_name] = converter - self._struct_to_name[struct_signature] = semantic_type_name - - def get_converter_for_python_type( - self, python_type: DataType - ) -> SemanticStructConverterProtocol | None: - """Get converter registered to the Python type.""" - # Direct lookup first - struct_signature = self._python_to_struct.get(python_type) - if struct_signature: - return self._struct_to_converter[struct_signature] - - # Handle subclass relationships - add safety check - for registered_type, struct_signature in self._python_to_struct.items(): - try: - if ( - isinstance(registered_type, type) - and isinstance(python_type, type) - and issubclass(python_type, registered_type) - ): - return self._struct_to_converter[struct_signature] - except TypeError: - # Handle cases where issubclass fails (e.g., with generic types) - continue - - return None - - def get_converter_for_semantic_type( - self, semantic_type_name: str - ) -> SemanticStructConverterProtocol | None: - """Get converter registered to the semantic type name.""" - return self._name_to_converter.get(semantic_type_name) - - def get_converter_for_struct_signature( - self, struct_signature: "pa.StructType" - ) -> SemanticStructConverterProtocol | None: - """ - Get converter registered to the Arrow struct signature. - """ - return self._struct_to_converter.get(struct_signature) - - def get_python_type_for_semantic_struct_signature( - self, struct_signature: "pa.StructType" - ) -> DataType | None: - """ - Get Python type registered to the Arrow struct signature. - """ - return self._struct_to_python.get(struct_signature) - - def get_semantic_struct_signature_for_python_type( - self, python_type: type - ) -> "pa.StructType | None": - """Get Arrow struct signature registered to the Python type.""" - return self._python_to_struct.get(python_type) - - def has_semantic_type(self, semantic_type_name: str) -> bool: - """Check if the semantic type name is registered.""" - return semantic_type_name in self._name_to_converter - - def has_python_type(self, python_type: type) -> bool: - """Check if the Python type is registered.""" - return python_type in self._python_to_struct - - def has_semantic_struct_signature(self, struct_signature: "pa.StructType") -> bool: - """Check if the struct signature is registered.""" - return struct_signature in self._struct_to_python - - def list_semantic_types(self) -> list[str]: - """Get all registered semantic type names.""" - return list(self._name_to_converter.keys()) - - def list_python_types(self) -> list[DataType]: - """Get all registered Python types.""" - return list(self._python_to_struct.keys()) - - def list_struct_signatures(self) -> list["pa.StructType"]: - """Get all registered struct signatures.""" - return list(self._struct_to_python.keys()) - - def find_semantic_fields_in_schema(self, schema: "pa.Schema") -> dict[str, str]: - """ - Find all semantic type fields in a schema by struct signature recognition. - - Args: - schema: PyArrow schema to examine - - Returns: - Dictionary mapping field names to semantic type names - - Example: - schema with fields: - - name: string - - file_path: struct - - location: struct - - Returns: {"file_path": "path", "location": "geolocation"} - """ - semantic_fields = {} - for field in schema: - if pa.types.is_struct(field.type) and field.type in self._struct_to_name: - semantic_fields[field.name] = self._struct_to_name[field.type] - return semantic_fields - - def get_semantic_field_info(self, schema: "pa.Schema") -> dict[str, dict[str, Any]]: - """ - Get detailed information about semantic fields in a schema. - - Returns: - Dictionary with field names as keys and info dictionaries as values. - Each info dict contains: semantic_type, python_type, struct_signature - """ - semantic_info = {} - for field in schema: - if pa.types.is_struct(field.type): - converter = self.get_converter_for_struct_signature(field.type) - if converter: - semantic_info[field.name] = { - "python_type": converter.python_type, - "struct_signature": field.type, - "converter": converter, - } - return semantic_info - - def validate_struct_signature( - self, struct_signature: "pa.StructType", expected_python_type: type - ) -> bool: - """ - Validate that a struct signature matches the expected Python type. - - Args: - struct_signature: Arrow struct type to validate - expected_python_type: Expected Python type - - Returns: - True if the struct signature is registered for the Python type - """ - registered_type = self.get_python_type_for_semantic_struct_signature( - struct_signature - ) - return registered_type == expected_python_type diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py deleted file mode 100644 index 54be49a2..00000000 --- a/src/orcapod/semantic_types/semantic_struct_converters.py +++ /dev/null @@ -1,333 +0,0 @@ -""" -Struct-based semantic type system for OrcaPod. - -This replaces the metadata-based approach with explicit struct fields, -making semantic types visible in schemas and preserved through operations. -""" - -from __future__ import annotations - -import uuid as _uuid_module -from abc import ABC, abstractmethod -from pathlib import Path -from typing import TYPE_CHECKING, Any - -from upath import UPath - -from orcapod.types import ContentHash -from orcapod.utils.lazy_module import LazyModule - -if TYPE_CHECKING: - import pyarrow as pa - - from orcapod.protocols.hashing_protocols import FileContentHasherProtocol -else: - pa = LazyModule("pyarrow") - - -class SemanticStructConverterBase: - """ - Base class providing common functionality for semantic struct converters. - - Subclasses only need to implement the abstract methods and can use - the common hashing infrastructure. - """ - - def __init__(self, semantic_type_name: str): - self._semantic_type_name = semantic_type_name - self._hasher_id = f"{self.semantic_type_name}_content_sha256" - - @property - def semantic_type_name(self) -> str: - """The name of the semantic type this converter handles.""" - return self._semantic_type_name - - @property - def hasher_id(self) -> str: - """Default hasher ID based on semantic type name""" - return self._hasher_id - - def _compute_content_hash(self, content: bytes) -> ContentHash: - """Compute SHA-256 hash of content bytes. - - Args: - content: Content to hash. - - Returns: - ``ContentHash`` with ``method="sha256"`` and the raw digest. - """ - import hashlib - - digest = hashlib.sha256(content).digest() - return ContentHash(method="sha256", digest=digest) - - def _format_semantic_hash(self, content_hash: ContentHash) -> str: - """Format a ``ContentHash`` into the standard semantic hash string. - - Always returns ``"{semantic_type_name}:{method}:{hex}"``, - e.g. ``"uuid:sha256:abc123"``. - - Args: - content_hash: Hash to format. - - Returns: - Formatted hash string with semantic type and algorithm prefix. - """ - return f"{self.semantic_type_name}:{content_hash.to_string(prefix_method=True)}" - - -class PathStructConverterBase(SemanticStructConverterBase, ABC): - """Base converter for file path types (Path and UPath). - - Extracts the shared conversion logic since Path and UPath have - identical APIs for the operations we need (str conversion, - construction from string, ``read_bytes``). - """ - - def __init__( - self, - name: str, - path_type: type, - file_hasher: "FileContentHasherProtocol", - ): - super().__init__(name) - self._python_type = path_type - self._field_name = name - self._file_hasher = file_hasher - self._arrow_struct_type = pa.struct([ - pa.field(name, pa.large_string()), - ]) - - @property - def python_type(self) -> type: - return self._python_type - - @property - def arrow_struct_type(self) -> "pa.StructType": - return self._arrow_struct_type - - @abstractmethod - def _make_path(self, path_str: str) -> Any: - """Construct the appropriate path object from a string.""" - ... - - def python_to_struct_dict(self, value: Any) -> dict[str, Any]: - """Convert path object to struct dictionary.""" - if not isinstance(value, self._python_type): - raise TypeError(f"Expected {self._python_type.__name__}, got {type(value)}") - return {self._field_name: str(value)} - - def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Any: - """Convert struct dictionary back to path object.""" - path_str = struct_dict.get(self._field_name) - if path_str is None: - raise ValueError(f"Missing '{self._field_name}' field in struct") - return self._make_path(path_str) - - def can_handle_python_type(self, python_type: type) -> bool: - """Check if this converter can handle the given Python type.""" - return issubclass(python_type, self._python_type) - - def can_handle_struct_type(self, struct_type: "pa.StructType") -> bool: - """Check if this converter can handle the given struct type.""" - for field in self._arrow_struct_type: - if ( - field.name not in struct_type.names - or struct_type[field.name].type != field.type - ): - return False - return True - - def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: - """Check if a struct dictionary represents this semantic type.""" - return ( - set(struct_dict.keys()) == {self._field_name} - and isinstance(struct_dict[self._field_name], str) - ) - - def hash_struct_dict(self, struct_dict: dict[str, Any]) -> str: - """Compute hash of a path semantic type by hashing the file content. - - Returns a string of the form ``"{type}:{algorithm}:{hex}"``, - e.g. ``"path:sha256:abc123"``. - - Args: - struct_dict: Dict with the path field containing a file path string. - - Returns: - Hash string of the file content with semantic type and algorithm prefix. - - Raises: - FileNotFoundError: If the path does not exist. - IsADirectoryError: If the path is a directory. - """ - path_str = struct_dict.get(self._field_name) - if path_str is None: - raise ValueError(f"Missing '{self._field_name}' field in struct dict") - - path = self._make_path(path_str) - if not path.exists(): - raise FileNotFoundError(f"Path does not exist: {path}") - if path.is_dir(): - raise IsADirectoryError(f"Path is a directory: {path}") - - file_hash = self._file_hasher.hash_file(path) - return self._format_semantic_hash(file_hash) - - -class PythonPathStructConverter(PathStructConverterBase): - """Converter for pathlib.Path objects to/from semantic structs. - - Rejects ``UPath`` instances to avoid ambiguity with - ``UPathStructConverter``, since ``UPath`` is a ``Path`` subclass. - """ - - def __init__(self, file_hasher: "FileContentHasherProtocol"): - super().__init__("path", Path, file_hasher) - - def _make_path(self, path_str: str) -> Path: - return Path(path_str) - - def python_to_struct_dict(self, value: Any) -> dict[str, Any]: - """Convert Path to struct dictionary, rejecting UPath instances.""" - if isinstance(value, UPath): - raise TypeError( - f"Expected Path (not UPath), got {type(value)}. " - "Use UPathStructConverter for UPath instances." - ) - return super().python_to_struct_dict(value) - - def can_handle_python_type(self, python_type: type) -> bool: - """Check if this converter can handle the given Python type. - - Returns False for UPath (and its subclasses) to avoid ambiguity. - """ - if issubclass(python_type, UPath): - return False - return issubclass(python_type, Path) - - -class UPathStructConverter(PathStructConverterBase): - """Converter for universal_pathlib.UPath objects to/from semantic structs.""" - - def __init__(self, file_hasher: "FileContentHasherProtocol"): - super().__init__("upath", UPath, file_hasher) - - def _make_path(self, path_str: str) -> UPath: - return UPath(path_str) - - -class UUIDStructConverter(SemanticStructConverterBase): - """Converter for ``uuid.UUID`` objects to/from Arrow semantic structs. - - Stores UUIDs as fixed 16-byte binary values inside a single-field struct, - following the same pattern as ``PythonPathStructConverter`` and - ``UPathStructConverter``. - - Note: - ``uuid_utils.UUID`` objects (e.g. from ``uuid7()``) are accepted via - duck typing because they expose a ``.bytes`` attribute but do not - inherit from ``uuid.UUID``. - """ - - def __init__(self) -> None: - super().__init__("uuid") - self._python_type = _uuid_module.UUID - self._arrow_struct_type = pa.struct([pa.field("uuid", pa.binary(16))]) - - @property - def python_type(self) -> type: - """The Python type this converter handles (``uuid.UUID``).""" - return self._python_type - - @property - def arrow_struct_type(self) -> "pa.StructType": - """The Arrow struct type used for serialisation.""" - return self._arrow_struct_type - - def python_to_struct_dict(self, value: Any) -> dict[str, bytes]: - """Convert a UUID to a struct dictionary with a single ``uuid`` field. - - Accepts both ``uuid.UUID`` instances and duck-typed UUID-compatible - objects (e.g. ``uuid_utils.UUID``) that expose a ``.bytes`` attribute - returning 16 raw bytes. - - Args: - value: A ``uuid.UUID`` instance or compatible UUID-like object. - - Returns: - A dict with a single key ``"uuid"`` whose value is 16 raw bytes. - - Raises: - TypeError: If ``value`` is not a ``uuid.UUID`` instance or - compatible duck-typed UUID object. - """ - if isinstance(value, _uuid_module.UUID): - return {"uuid": value.bytes} - # Accept uuid_utils.UUID and other duck-typed UUID objects - raw = getattr(value, "bytes", None) - if isinstance(raw, bytes) and len(raw) == 16: - return {"uuid": raw} - raise TypeError( - f"Expected uuid.UUID or compatible UUID object, got {type(value)}" - ) - - def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> _uuid_module.UUID: - """Convert a struct dictionary back to a ``uuid.UUID`` instance. - - Args: - struct_dict: Dict with a ``"uuid"`` key containing 16 raw bytes - (``bytes`` or ``bytearray``). - - Returns: - A ``uuid.UUID`` constructed from the raw bytes. - - Raises: - ValueError: If the ``"uuid"`` key is absent from ``struct_dict``. - """ - raw = struct_dict.get("uuid") - if raw is None: - raise ValueError("Missing 'uuid' field in struct dict") - return _uuid_module.UUID(bytes=bytes(raw)) - - def can_handle_python_type(self, python_type: type) -> bool: - """Check if this converter can handle the given Python type. - - Args: - python_type: The Python type to check. - - Returns: - ``True`` if ``python_type`` is ``uuid.UUID`` or a subclass of it. - """ - return issubclass(python_type, self._python_type) - - def can_handle_struct_type(self, struct_type: "pa.StructType") -> bool: - """Check if this converter can handle the given Arrow struct type. - - Args: - struct_type: The Arrow struct type to check. - - Returns: - ``True`` if ``struct_type`` equals the UUID Arrow struct type. - """ - return struct_type == self._arrow_struct_type - - def hash_struct_dict(self, struct_dict: dict[str, Any]) -> str: - """Compute a SHA-256 hash of the UUID from its struct dictionary representation. - - Hashes the raw 16 UUID bytes directly. - - Args: - struct_dict: Dict with a ``"uuid"`` key containing 16 raw bytes. - - Returns: - Hash string of the form ``"uuid:sha256:"``. - - Raises: - ValueError: If the ``"uuid"`` key is absent from ``struct_dict``. - """ - raw = struct_dict.get("uuid") - if raw is None: - raise ValueError("Missing 'uuid' field in struct dict") - content_hash = self._compute_content_hash(bytes(raw)) - return self._format_semantic_hash(content_hash) diff --git a/tests/test_hashing/test_file_hashing_consistency.py b/tests/test_hashing/test_file_hashing_consistency.py deleted file mode 100644 index 70412e9d..00000000 --- a/tests/test_hashing/test_file_hashing_consistency.py +++ /dev/null @@ -1,219 +0,0 @@ -""" -Integration tests verifying that file hashing is consistent across both paths: - -1. **Arrow hasher path**: SemanticArrowHasher processes an Arrow table containing a - path struct column → calls PythonPathStructConverter.hash_struct_dict → file_hasher. -2. **Semantic hasher path**: SemanticAwarePythonHasher hashes a Python Path object → - calls PathSemanticHasher.handle → file_hasher. - -Both paths must delegate to the same FileContentHasherProtocol so that identical -file content always produces identical hashes, regardless of entry point. -""" - -from pathlib import Path - -import pyarrow as pa -import pytest - -from orcapod.hashing.arrow_hashers import SemanticArrowHasher -from orcapod.hashing.file_hashers import BasicFileHasher -from orcapod.hashing.semantic_hashing.builtin_handlers import ( - register_builtin_python_type_semantic_hashers, -) -from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher -from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeSemanticHasherRegistry -from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry -from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter - - -# --------------------------------------------------------------------------- -# Shared fixtures -# --------------------------------------------------------------------------- - - -@pytest.fixture -def file_hasher(): - """Single file hasher instance shared by both paths.""" - return BasicFileHasher(algorithm="sha256") - - -@pytest.fixture -def path_converter(file_hasher): - return PythonPathStructConverter(file_hasher=file_hasher) - - -@pytest.fixture -def arrow_hasher(path_converter): - """SemanticArrowHasher wired with the shared file_hasher via PythonPathStructConverter.""" - registry = SemanticTypeRegistry() - registry.register_converter("path", path_converter) - return SemanticArrowHasher(semantic_registry=registry) - - -@pytest.fixture -def semantic_hasher(file_hasher): - """SemanticAwarePythonHasher wired with the shared file_hasher via PathSemanticHasher.""" - registry = PythonTypeSemanticHasherRegistry() - register_builtin_python_type_semantic_hashers(registry, file_hasher=file_hasher) - return SemanticAwarePythonHasher( - hasher_id="test_v1", type_semantic_hasher_registry=registry, strict=True - ) - - -# --------------------------------------------------------------------------- -# Arrow struct hasher: path column tests -# --------------------------------------------------------------------------- - - -class TestArrowStructPathHashing: - """Tests for file hashing through the Arrow hasher path.""" - - def test_same_content_different_paths_same_hash(self, arrow_hasher, tmp_path): - """Two distinct files with identical content produce the same table hash.""" - file1 = tmp_path / "a.txt" - file2 = tmp_path / "b.txt" - file1.write_text("identical content") - file2.write_text("identical content") - - table1 = pa.table( - {"file": [{"path": str(file1)}]}, - schema=pa.schema( - [pa.field("file", pa.struct([pa.field("path", pa.large_string())]))] - ), - ) - table2 = pa.table( - {"file": [{"path": str(file2)}]}, - schema=pa.schema( - [pa.field("file", pa.struct([pa.field("path", pa.large_string())]))] - ), - ) - - hash1 = arrow_hasher.hash_table(table1) - hash2 = arrow_hasher.hash_table(table2) - assert hash1.digest == hash2.digest - - def test_modified_content_different_hash(self, arrow_hasher, tmp_path): - """Same path with modified content between hashes yields different hash.""" - file = tmp_path / "mutable.txt" - file.write_text("version 1") - - schema = pa.schema( - [pa.field("file", pa.struct([pa.field("path", pa.large_string())]))] - ) - table_v1 = pa.table({"file": [{"path": str(file)}]}, schema=schema) - hash1 = arrow_hasher.hash_table(table_v1) - - file.write_text("version 2") - table_v2 = pa.table({"file": [{"path": str(file)}]}, schema=schema) - hash2 = arrow_hasher.hash_table(table_v2) - - assert hash1.digest != hash2.digest - - def test_different_content_different_hash(self, arrow_hasher, tmp_path): - """Two files with different content produce different table hashes.""" - file1 = tmp_path / "x.txt" - file2 = tmp_path / "y.txt" - file1.write_text("content A") - file2.write_text("content B") - - schema = pa.schema( - [pa.field("file", pa.struct([pa.field("path", pa.large_string())]))] - ) - table1 = pa.table({"file": [{"path": str(file1)}]}, schema=schema) - table2 = pa.table({"file": [{"path": str(file2)}]}, schema=schema) - - hash1 = arrow_hasher.hash_table(table1) - hash2 = arrow_hasher.hash_table(table2) - assert hash1.digest != hash2.digest - - -# --------------------------------------------------------------------------- -# Semantic hasher: Path object tests -# --------------------------------------------------------------------------- - - -class TestSemanticPathHashing: - """Tests for file hashing through the semantic hasher path.""" - - def test_same_content_different_paths_same_hash(self, semantic_hasher, tmp_path): - """Two distinct Path objects pointing to files with identical content.""" - file1 = tmp_path / "a.txt" - file2 = tmp_path / "b.txt" - file1.write_text("identical content") - file2.write_text("identical content") - - hash1 = semantic_hasher.hash_object(Path(file1)) - hash2 = semantic_hasher.hash_object(Path(file2)) - assert hash1.digest == hash2.digest - - def test_modified_content_different_hash(self, semantic_hasher, tmp_path): - """Same Path with modified content between hashes.""" - file = tmp_path / "mutable.txt" - file.write_text("version 1") - hash1 = semantic_hasher.hash_object(Path(file)) - - file.write_text("version 2") - hash2 = semantic_hasher.hash_object(Path(file)) - assert hash1.digest != hash2.digest - - def test_different_content_different_hash(self, semantic_hasher, tmp_path): - """Two Paths pointing to different content produce different hashes.""" - file1 = tmp_path / "x.txt" - file2 = tmp_path / "y.txt" - file1.write_text("content A") - file2.write_text("content B") - - hash1 = semantic_hasher.hash_object(Path(file1)) - hash2 = semantic_hasher.hash_object(Path(file2)) - assert hash1.digest != hash2.digest - - -# --------------------------------------------------------------------------- -# Cross-path consistency -# --------------------------------------------------------------------------- - - -class TestCrossPathConsistency: - """Verify that the arrow hasher and semantic hasher use the same file_hasher - and produce equivalent file content hashes for the same underlying file.""" - - def test_arrow_and_semantic_hash_same_file_content( - self, path_converter, semantic_hasher, file_hasher, tmp_path - ): - """The file content hash extracted by PythonPathStructConverter.hash_struct_dict - must embed the same digest as ContentHash produced by PathContentHandler.handle - (which the semantic hasher uses internally for Path objects). - - Both paths ultimately call file_hasher.hash_file(path), so the raw digest - must be identical. hash_struct_dict always returns the fully-prefixed form - "path:sha256:", so we strip the prefix when comparing. - """ - file = tmp_path / "shared.txt" - file.write_text("shared content for both paths") - - # Arrow path: PythonPathStructConverter.hash_struct_dict — always prefixed - arrow_hash = path_converter.hash_struct_dict({"path": str(file)}) - # Strip "path:sha256:" prefix to get the raw hex - arrow_hash_hex = arrow_hash.split(":")[-1] - - # Semantic path: file_hasher.hash_file directly (same as PathContentHandler) - semantic_content_hash = file_hasher.hash_file(file) - - assert arrow_hash_hex == semantic_content_hash.digest.hex() - - def test_arrow_and_semantic_same_content_two_files( - self, path_converter, file_hasher, tmp_path - ): - """Two files with identical content: arrow struct hash_struct_dict and - direct file_hasher.hash_file produce the same digest.""" - file1 = tmp_path / "file_arrow.txt" - file2 = tmp_path / "file_semantic.txt" - content = "same content for cross-path test" - file1.write_text(content) - file2.write_text(content) - - # hash_struct_dict always returns "path:sha256:" — strip prefix - arrow_hex = path_converter.hash_struct_dict({"path": str(file1)}).split(":")[-1] - semantic_hex = file_hasher.hash_file(file2).digest.hex() - - assert arrow_hex == semantic_hex diff --git a/tests/test_semantic_types/test_path_struct_converter.py b/tests/test_semantic_types/test_path_struct_converter.py deleted file mode 100644 index 740b0c16..00000000 --- a/tests/test_semantic_types/test_path_struct_converter.py +++ /dev/null @@ -1,132 +0,0 @@ -from pathlib import Path -from typing import cast - -import pytest - -from orcapod.hashing.file_hashers import BasicFileHasher -from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter - - -@pytest.fixture -def file_hasher(): - return BasicFileHasher(algorithm="sha256") - - -@pytest.fixture -def converter(file_hasher): - return PythonPathStructConverter(file_hasher=file_hasher) - - -def test_path_to_struct_and_back(converter): - path_obj = Path("/tmp/test.txt") - struct_dict = converter.python_to_struct_dict(path_obj) - assert struct_dict["path"] == str(path_obj) - restored = converter.struct_dict_to_python(struct_dict) - assert restored == path_obj - - -def test_path_to_struct_invalid_type(converter): - with pytest.raises(TypeError): - converter.python_to_struct_dict("not_a_path") # type: ignore - - -def test_struct_to_python_missing_field(converter): - with pytest.raises(ValueError): - converter.struct_dict_to_python({}) - - -def test_can_handle_python_type(converter): - assert converter.can_handle_python_type(Path) - assert not converter.can_handle_python_type(str) - - -def test_can_handle_struct_type(converter): - struct_type = converter.arrow_struct_type - assert converter.can_handle_struct_type(struct_type) - - # Should fail for wrong fields - class FakeField: - def __init__(self, name, type): - self.name = name - self.type = type - - class FakeStructType(list): - @property - def names(self): - return [f.name for f in self] - - pass - - import pyarrow as pa - - fake_struct = cast( - pa.StructType, FakeStructType([FakeField("wrong", struct_type[0].type)]) - ) - assert not converter.can_handle_struct_type(fake_struct) - - -def test_is_semantic_struct(converter): - assert converter.is_semantic_struct({"path": "/tmp/test.txt"}) - assert not converter.is_semantic_struct({"not_path": "value"}) - assert not converter.is_semantic_struct({"path": 123}) - - -def test_hash_struct_dict_file_not_found(converter, tmp_path): - struct_dict = {"path": str(tmp_path / "does_not_exist.txt")} - with pytest.raises(FileNotFoundError): - converter.hash_struct_dict(struct_dict) - - -def test_hash_struct_dict_is_directory(converter, tmp_path): - struct_dict = {"path": str(tmp_path)} - with pytest.raises(IsADirectoryError): - converter.hash_struct_dict(struct_dict) - - -def test_hash_struct_dict_content_based(converter, tmp_path): - """Two distinct files with identical content produce the same hash.""" - file1 = tmp_path / "file1.txt" - file2 = tmp_path / "file2.txt" - content = "identical content" - file1.write_text(content) - file2.write_text(content) - hash1 = converter.hash_struct_dict({"path": str(file1)}) - hash2 = converter.hash_struct_dict({"path": str(file2)}) - assert hash1 == hash2 - - -def test_hash_path_objects_content_based(converter, tmp_path): - """Round-trip through python_to_struct_dict then hash_struct_dict.""" - file1 = tmp_path / "fileA.txt" - file2 = tmp_path / "fileB.txt" - content = "same file content" - file1.write_text(content) - file2.write_text(content) - struct_dict1 = converter.python_to_struct_dict(Path(file1)) - struct_dict2 = converter.python_to_struct_dict(Path(file2)) - hash1 = converter.hash_struct_dict(struct_dict1) - hash2 = converter.hash_struct_dict(struct_dict2) - assert hash1 == hash2 - - -def test_hash_struct_dict_with_prefix(converter, tmp_path): - """Hash always starts with 'path:sha256:'.""" - file = tmp_path / "file.txt" - file.write_text("hello") - hash_str = converter.hash_struct_dict({"path": str(file)}) - assert hash_str.startswith("path:sha256:") - - -def test_hash_struct_dict_different_content(converter, tmp_path): - """Same path with modified content yields a different hash.""" - file = tmp_path / "mutable.txt" - file.write_text("version 1") - hash1 = converter.hash_struct_dict({"path": str(file)}) - file.write_text("version 2") - hash2 = converter.hash_struct_dict({"path": str(file)}) - assert hash1 != hash2 - - -def test_hash_struct_dict_missing_path_field(converter): - with pytest.raises(ValueError, match="Missing 'path' field"): - converter.hash_struct_dict({}) diff --git a/tests/test_semantic_types/test_pydata_utils.py b/tests/test_semantic_types/test_pydata_utils.py deleted file mode 100644 index d9716866..00000000 --- a/tests/test_semantic_types/test_pydata_utils.py +++ /dev/null @@ -1,136 +0,0 @@ -from pathlib import Path, PosixPath -from typing import Any - -import pytest - -from orcapod.semantic_types import pydata_utils - - -def test_pylist_to_pydict_typical(): - data = [{"a": 1, "b": 2}, {"a": 3, "c": 4}] - result = pydata_utils.pylist_to_pydict(data) - assert result == {"a": [1, 3], "b": [2, None], "c": [None, 4]} - - -def test_pylist_to_pydict_missing_keys(): - data = [{"a": 1}, {"b": 2}, {"a": 3, "b": 4}] - result = pydata_utils.pylist_to_pydict(data) - assert result == {"a": [1, None, 3], "b": [None, 2, 4]} - - -def test_pylist_to_pydict_empty(): - assert pydata_utils.pylist_to_pydict([]) == {} - - -def test_pylist_to_pydict_empty_dicts(): - data = [{}, {}, {}] - assert pydata_utils.pylist_to_pydict(data) == {} - - -def test_pydict_to_pylist_typical(): - data = {"a": [1, 3], "b": [2, None], "c": [None, 4]} - result = pydata_utils.pydict_to_pylist(data) - assert result == [{"a": 1, "b": 2, "c": None}, {"a": 3, "b": None, "c": 4}] - - -def test_pydict_to_pylist_uneven_lengths(): - data = {"a": [1, 2], "b": [3]} - with pytest.raises(ValueError): - pydata_utils.pydict_to_pylist(data) - - -def test_pydict_to_pylist_empty(): - assert pydata_utils.pydict_to_pylist({}) == [] - - -def test_pydict_to_pylist_empty_lists(): - data = {"a": [], "b": []} - assert pydata_utils.pydict_to_pylist(data) == [] - - -def test_infer_python_schema_from_pylist_data_typical(): - data = [{"a": 1, "b": 2.0}, {"a": 3, "b": None}] - schema = pydata_utils.infer_python_schema_from_pylist_data(data) - assert schema["a"] in (int, int | None) - assert schema["b"] in (float | None, float) - - -def test_infer_python_schema_from_pylist_data_complex(): - data = [ - {"path": Path("/tmp/file1"), "size": 123}, - {"path": Path("/tmp/file2"), "size": None}, - ] - schema = pydata_utils.infer_python_schema_from_pylist_data(data) - assert schema["path"] in (Path, PosixPath) - assert schema["size"] == int | None - - -def test_infer_python_schema_from_pylist_data_empty(): - assert pydata_utils.infer_python_schema_from_pylist_data([]) == {} - - -def test_infer_python_schema_from_pylist_data_mixed_types(): - data = [{"a": 1}, {"a": "x"}, {"a": 2.5}] - schema = pydata_utils.infer_python_schema_from_pylist_data(data) - # Should be Union[int, float, str] or Any - assert "a" in schema - - -def test_infer_python_schema_from_pydict_data_typical(): - data = {"a": [1, 2], "b": [None, 3.5]} - schema = pydata_utils.infer_python_schema_from_pydict_data(data) - assert schema["a"] in (int, int | None) - assert schema["b"] in (float | None, float) - - -def test_infer_python_schema_from_pydict_data_empty(): - assert pydata_utils.infer_python_schema_from_pydict_data({}) == {} - - -def test_infer_python_schema_from_pydict_data_empty_lists(): - data = {"a": [], "b": []} - schema = pydata_utils.infer_python_schema_from_pydict_data(data) - assert schema["a"] == str | None - assert schema["b"] == str | None - - -def test_infer_python_schema_from_pydict_data_mixed_types(): - data = {"a": [1, "x", 2.5]} - schema = pydata_utils.infer_python_schema_from_pydict_data(data) - assert "a" in schema - - -def test_round_trip_pylist_pydict(): - data = [{"a": 1, "b": 2}, {"a": 3, "c": 4}] - pydict = pydata_utils.pylist_to_pydict(data) - pylist = pydata_utils.pydict_to_pylist(pydict) - # Should be equivalent to original data (order of keys may differ) - for orig, roundtrip in zip(data, pylist): - # Compare dicts for value equality, ignoring key order and missing keys - for k in orig: - assert orig[k] == roundtrip[k] - - -def test_round_trip_pydict_pylist(): - data = {"a": [1, 3], "b": [2, None], "c": [None, 4]} - pylist = pydata_utils.pydict_to_pylist(data) - pydict = pydata_utils.pylist_to_pydict(pylist) - for k in data: - assert pydict[k] == data[k] - - -# --------------------------------------------------------------------------- -# ENG-389: empty container inference produces list[Any] / dict[Any, Any] -# --------------------------------------------------------------------------- - - -def test_infer_empty_list_schema(): - """A field whose only value is [] infers as list[Any].""" - schema = pydata_utils.infer_python_schema_from_pylist_data([{"items": []}]) - assert schema["items"] == list[Any] - - -def test_infer_empty_dict_schema(): - """A field whose only value is {} infers as dict[Any, Any].""" - schema = pydata_utils.infer_python_schema_from_pylist_data([{"meta": {}}]) - assert schema["meta"] == dict[Any, Any] diff --git a/tests/test_semantic_types/test_schema_arrow_equality.py b/tests/test_semantic_types/test_schema_arrow_equality.py deleted file mode 100644 index cc04e141..00000000 --- a/tests/test_semantic_types/test_schema_arrow_equality.py +++ /dev/null @@ -1,324 +0,0 @@ -""" -Tests verifying Schema ↔ Arrow logical equality (PLT-923). - -Coverage --------- -- Python-equal schemas produce logically equal Arrow schemas -- Python-unequal schemas produce logically unequal Arrow schemas -- Field insertion order does not affect logical equality -- Nullability correspondence: T | None → nullable=True, plain T → nullable=False -- Round-trip: python_schema_to_arrow_schema ∘ arrow_schema_to_python_schema is lossless -- Nested/complex types maintain the correspondence -- Schema.as_required() strips optional_fields for Arrow-level comparison - -"Logical equality" is determined by StarfixArrowHasher.hash_schema digest equality: -column-order-independent, Utf8/LargeUtf8 and Binary/LargeBinary normalised, -nullability-sensitive. -""" - -from __future__ import annotations - -from pathlib import Path - -import pyarrow as pa - -from orcapod.contexts import get_default_context -from orcapod.hashing.arrow_hashers import StarfixArrowHasher -from orcapod.semantic_types import SemanticTypeRegistry -from orcapod.types import Schema - -# --------------------------------------------------------------------------- -# Shared infrastructure -# --------------------------------------------------------------------------- - -# SemanticTypeRegistry is empty: hash_schema operates on Arrow types only and -# never consults the semantic registry (unlike hash_table). -_hasher = StarfixArrowHasher(SemanticTypeRegistry(), hasher_id="test") - - -def _to_arrow(schema: Schema) -> pa.Schema: - """Convert a Python Schema to an Arrow schema via the default context.""" - return get_default_context().type_converter.python_schema_to_arrow_schema(schema) - - -def _arrow_logical_eq(s1: pa.Schema, s2: pa.Schema) -> bool: - """Return True if two Arrow schemas are logically equal under the starfix hash.""" - return _hasher.hash_schema(s1).digest == _hasher.hash_schema(s2).digest - - -# --------------------------------------------------------------------------- -# Positive: equal Python schemas → logically equal Arrow schemas -# --------------------------------------------------------------------------- - - -class TestEqualSchemasHaveLogicallyEqualArrowSchemas: - def test_single_int_field(self): - s1 = Schema(a=int) - s2 = Schema(a=int) - assert s1 == s2 - assert _arrow_logical_eq(_to_arrow(s1), _to_arrow(s2)) - - def test_single_float_field(self): - s1 = Schema(a=float) - s2 = Schema(a=float) - assert _arrow_logical_eq(_to_arrow(s1), _to_arrow(s2)) - - def test_single_str_field(self): - s1 = Schema(a=str) - s2 = Schema(a=str) - assert _arrow_logical_eq(_to_arrow(s1), _to_arrow(s2)) - - def test_single_bool_field(self): - s1 = Schema(a=bool) - s2 = Schema(a=bool) - assert _arrow_logical_eq(_to_arrow(s1), _to_arrow(s2)) - - def test_single_bytes_field(self): - s1 = Schema(a=bytes) - s2 = Schema(a=bytes) - assert _arrow_logical_eq(_to_arrow(s1), _to_arrow(s2)) - - def test_multiple_primitive_fields(self): - s1 = Schema({"a": int, "b": float, "c": str}) - s2 = Schema({"a": int, "b": float, "c": str}) - assert s1 == s2 - assert _arrow_logical_eq(_to_arrow(s1), _to_arrow(s2)) - - def test_kwargs_vs_mapping_construction(self): - """Schema(a=int, b=str) must equal Schema({"a": int, "b": str}).""" - s_kwargs = Schema(a=int, b=str) - s_mapping = Schema({"a": int, "b": str}) - assert s_kwargs == s_mapping - assert _arrow_logical_eq(_to_arrow(s_kwargs), _to_arrow(s_mapping)) - - def test_empty_schema(self): - s1 = Schema.empty() - s2 = Schema({}) - assert s1 == s2 - assert _arrow_logical_eq(_to_arrow(s1), _to_arrow(s2)) - - def test_schema_equals_plain_dict(self): - """Schema.__eq__ accepts plain Mapping; dict → Arrow conversion must match.""" - s = Schema({"x": int}) - d = {"x": int} - # Schema.__eq__ raises NotImplementedError for non-Mapping non-Schema; plain - # dict is a Mapping so this should work. - assert s == d - assert _arrow_logical_eq( - _to_arrow(s), - get_default_context().type_converter.python_schema_to_arrow_schema(d), - ) - - -# --------------------------------------------------------------------------- -# Negative: unequal Python schemas → logically unequal Arrow schemas -# --------------------------------------------------------------------------- - - -class TestUnequalSchemasHaveLogicallyUnequalArrowSchemas: - def test_different_field_names(self): - s1 = Schema(a=int) - s2 = Schema(b=int) - assert s1 != s2 - assert not _arrow_logical_eq(_to_arrow(s1), _to_arrow(s2)) - - def test_different_field_types(self): - s1 = Schema(a=int) - s2 = Schema(a=float) - assert s1 != s2 - assert not _arrow_logical_eq(_to_arrow(s1), _to_arrow(s2)) - - def test_subset_schema_differs(self): - s1 = Schema({"a": int, "b": str}) - s2 = Schema({"a": int}) - assert s1 != s2 - assert not _arrow_logical_eq(_to_arrow(s1), _to_arrow(s2)) - - -# --------------------------------------------------------------------------- -# Field ordering -# --------------------------------------------------------------------------- - - -class TestFieldOrderingDoesNotAffectLogicalEquality: - def test_two_fields_reversed_insertion_order(self): - """Both Python equality and Arrow logical equality are order-insensitive.""" - s1 = Schema({"a": int, "b": str}) - s2 = Schema({"b": str, "a": int}) - assert s1 == s2 - assert _arrow_logical_eq(_to_arrow(s1), _to_arrow(s2)) - - def test_three_fields_permuted_order(self): - s1 = Schema({"x": int, "y": float, "z": str}) - s2 = Schema({"z": str, "x": int, "y": float}) - assert s1 == s2 - assert _arrow_logical_eq(_to_arrow(s1), _to_arrow(s2)) - - -# --------------------------------------------------------------------------- -# Nullability correspondence -# --------------------------------------------------------------------------- - - -class TestNullabilityCorrespondence: - def test_plain_int_is_non_nullable(self): - arrow = _to_arrow(Schema(a=int)) - assert arrow.field("a").nullable is False - - def test_optional_int_is_nullable(self): - arrow = _to_arrow(Schema({"a": int | None})) - assert arrow.field("a").nullable is True - - def test_plain_primitives_all_non_nullable(self): - arrow = _to_arrow(Schema({"a": str, "b": float, "c": bool, "d": bytes})) - for name in ("a", "b", "c", "d"): - assert arrow.field(name).nullable is False, ( - f"Expected {name} to be non-nullable" - ) - - def test_optional_primitives_all_nullable(self): - arrow = _to_arrow(Schema({"a": str | None, "b": float | None})) - assert arrow.field("a").nullable is True - assert arrow.field("b").nullable is True - - def test_int_and_optional_int_are_python_unequal(self): - assert Schema(a=int) != Schema({"a": int | None}) - - def test_int_and_optional_int_are_arrow_logically_unequal(self): - s_plain = Schema(a=int) - s_optional = Schema({"a": int | None}) - assert not _arrow_logical_eq(_to_arrow(s_plain), _to_arrow(s_optional)) - - -# --------------------------------------------------------------------------- -# Round-trip: Python → Arrow → Python -# --------------------------------------------------------------------------- - - -class TestRoundTrip: - def _round_trip(self, schema: Schema) -> Schema: - converter = get_default_context().type_converter - return converter.arrow_schema_to_python_schema( - converter.python_schema_to_arrow_schema(schema) - ) - - def test_int_stays_int(self): - result = self._round_trip(Schema(a=int)) - assert result["a"] == int - - def test_optional_int_stays_optional_int(self): - result = self._round_trip(Schema({"a": int | None})) - assert result["a"] == int | None - - def test_plain_str_stays_str(self): - result = self._round_trip(Schema(a=str)) - assert result["a"] == str - - def test_optional_str_stays_optional_str(self): - result = self._round_trip(Schema({"a": str | None})) - assert result["a"] == str | None - - def test_plain_float_stays_float(self): - result = self._round_trip(Schema(a=float)) - assert result["a"] == float - - def test_plain_bool_stays_bool(self): - result = self._round_trip(Schema(a=bool)) - assert result["a"] == bool - - def test_plain_bytes_stays_bytes(self): - result = self._round_trip(Schema(a=bytes)) - assert result["a"] == bytes - - def test_optional_float_stays_optional_float(self): - result = self._round_trip(Schema({"a": float | None})) - assert result["a"] == float | None - - def test_mixed_nullable_and_non_nullable(self): - original = Schema({"req": int, "opt": str | None, "also_req": float}) - result = self._round_trip(original) - assert result["req"] == int - assert result["opt"] == str | None - assert result["also_req"] == float - - -# --------------------------------------------------------------------------- -# Nested and complex types -# --------------------------------------------------------------------------- - - -class TestNestedAndComplexTypes: - def test_list_int_is_non_nullable(self): - arrow = _to_arrow(Schema({"a": list[int]})) - assert arrow.field("a").nullable is False - - def test_list_str_is_non_nullable(self): - arrow = _to_arrow(Schema({"a": list[str]})) - assert arrow.field("a").nullable is False - - def test_optional_list_int_is_nullable(self): - arrow = _to_arrow(Schema({"a": list[int] | None})) - assert arrow.field("a").nullable is True - - def test_nested_list_is_non_nullable(self): - arrow = _to_arrow(Schema({"a": list[list[int]]})) - assert arrow.field("a").nullable is False - - def test_path_is_non_nullable(self): - """Path → Arrow extension type (pathlib.Path), nullable=False.""" - arrow = _to_arrow(Schema({"p": Path})) - assert arrow.field("p").nullable is False - assert isinstance(arrow.field("p").type, pa.ExtensionType) - assert arrow.field("p").type.extension_name == "orcapod.path" - - def test_equal_list_schemas_are_logically_equal(self): - s1 = Schema({"items": list[int]}) - s2 = Schema({"items": list[int]}) - assert _arrow_logical_eq(_to_arrow(s1), _to_arrow(s2)) - - def test_list_int_and_list_str_are_logically_unequal(self): - s1 = Schema({"items": list[int]}) - s2 = Schema({"items": list[str]}) - assert not _arrow_logical_eq(_to_arrow(s1), _to_arrow(s2)) - - -# --------------------------------------------------------------------------- -# Schema.as_required() -# --------------------------------------------------------------------------- - - -class TestAsRequired: - def test_as_required_equals_schema_without_optional_fields(self): - """Schema with optional_fields equals a Schema without after as_required().""" - s_with_optional = Schema({"a": int, "b": str}, optional_fields=["b"]) - s_without = Schema({"a": int, "b": str}) - assert s_with_optional.as_required() == s_without - - def test_as_required_on_schema_without_optional_is_noop(self): - """as_required() on a fully required schema is idempotent.""" - s = Schema({"a": int, "b": str}) - assert s.as_required() == s - - def test_as_required_idempotent(self): - """Calling as_required() twice gives the same result as once.""" - s = Schema({"a": int}, optional_fields=["a"]) - assert s.as_required().as_required() == s.as_required() - - def test_schemas_differing_only_in_optional_fields_are_python_unequal(self): - """Two schemas with the same fields but different optional_fields are unequal.""" - s1 = Schema({"a": int, "b": str}, optional_fields=["b"]) - s2 = Schema({"a": int, "b": str}) - assert s1 != s2 - - def test_schemas_differing_only_in_optional_fields_have_equal_arrow_schemas(self): - """optional_fields has no Arrow representation — Arrow schemas must be equal.""" - s1 = Schema({"a": int, "b": str}, optional_fields=["b"]) - s2 = Schema({"a": int, "b": str}) - assert _arrow_logical_eq(_to_arrow(s1), _to_arrow(s2)) - - def test_as_required_implies_arrow_logical_equality(self): - """If s1.as_required() == s2.as_required(), their Arrow schemas are logically equal.""" - s1 = Schema({"x": int, "y": float}, optional_fields=["x"]) - s2 = Schema({"x": int, "y": float}) - assert s1.as_required() == s2.as_required() - assert _arrow_logical_eq(_to_arrow(s1), _to_arrow(s2)) diff --git a/tests/test_semantic_types/test_semantic_registry.py b/tests/test_semantic_types/test_semantic_registry.py deleted file mode 100644 index fd044ff5..00000000 --- a/tests/test_semantic_types/test_semantic_registry.py +++ /dev/null @@ -1,235 +0,0 @@ -import uuid -from unittest.mock import Mock - -import pyarrow as pa -import pytest - -from orcapod.semantic_types import semantic_registry - - -def test_registry_initialization(): - registry = semantic_registry.SemanticTypeRegistry() - assert registry.list_semantic_types() == [] - assert registry.list_python_types() == [] - assert registry.list_struct_signatures() == [] - - -def test_register_and_retrieve_converter(): - registry = semantic_registry.SemanticTypeRegistry() - python_type = Mock(name="PythonType") - struct_type = Mock(name="StructType") - converter = Mock() - converter.python_type = python_type - converter.arrow_struct_type = struct_type - registry.register_converter("mock_type", converter) - # Retrieve by semantic type name - assert registry.get_converter_for_semantic_type("mock_type") is converter - # Retrieve by python type - assert registry.get_converter_for_python_type(python_type) is converter - # Retrieve by struct signature - assert registry.get_converter_for_struct_signature(struct_type) is converter - - -def test_register_duplicate_semantic_type_raises(): - registry = semantic_registry.SemanticTypeRegistry() - python_type = Mock(name="PythonType") - struct_type = Mock(name="StructType") - converter1 = Mock() - converter1.python_type = python_type - converter1.arrow_struct_type = struct_type - registry.register_converter("mock_type", converter1) - converter2 = Mock() - converter2.python_type = python_type - converter2.arrow_struct_type = struct_type - with pytest.raises(ValueError): - registry.register_converter("mock_type", converter2) - - -def test_register_conflicting_python_type_raises(): - registry = semantic_registry.SemanticTypeRegistry() - python_type = Mock(name="PythonType") - struct_type1 = Mock(name="StructType1") - struct_type2 = Mock(name="StructType2") - converter1 = Mock() - converter1.python_type = python_type - converter1.arrow_struct_type = struct_type1 - registry.register_converter("mock_type1", converter1) - converter2 = Mock() - converter2.python_type = python_type - converter2.arrow_struct_type = struct_type2 - with pytest.raises(ValueError): - registry.register_converter("mock_type2", converter2) - - -def test_register_conflicting_struct_signature_raises(): - registry = semantic_registry.SemanticTypeRegistry() - python_type1 = Mock(name="PythonType1") - python_type2 = Mock(name="PythonType2") - struct_type = Mock(name="StructType") - converter1 = Mock() - converter1.python_type = python_type1 - converter1.arrow_struct_type = struct_type - registry.register_converter("mock_type1", converter1) - converter2 = Mock() - converter2.python_type = python_type2 - converter2.arrow_struct_type = struct_type - with pytest.raises(ValueError): - registry.register_converter("mock_type2", converter2) - - -def test_get_nonexistent_returns_none(): - registry = semantic_registry.SemanticTypeRegistry() - python_type = Mock(name="PythonType") - struct_type = Mock(name="StructType") - assert registry.get_converter_for_semantic_type("not_present") is None - assert registry.get_converter_for_python_type(python_type) is None - assert registry.get_converter_for_struct_signature(struct_type) is None - - -def test_list_registered_types(): - registry = semantic_registry.SemanticTypeRegistry() - python_type1 = Mock(name="PythonType1") - struct_type1 = Mock(name="StructType1") - converter1 = Mock() - converter1.python_type = python_type1 - converter1.arrow_struct_type = struct_type1 - registry.register_converter("mock_type1", converter1) - - python_type2 = Mock(name="PythonType2") - struct_type2 = Mock(name="StructType2") - converter2 = Mock() - converter2.python_type = python_type2 - converter2.arrow_struct_type = struct_type2 - registry.register_converter("mock_type2", converter2) - - assert set(registry.list_semantic_types()) == {"mock_type1", "mock_type2"} - assert set(registry.list_python_types()) == {python_type1, python_type2} - assert set(registry.list_struct_signatures()) == {struct_type1, struct_type2} - - -def test_has_methods(): - registry = semantic_registry.SemanticTypeRegistry() - python_type = Mock(name="PythonType") - struct_type = Mock(name="StructType") - converter = Mock() - converter.python_type = python_type - converter.arrow_struct_type = struct_type - registry.register_converter("mock_type", converter) - assert registry.has_semantic_type("mock_type") - assert registry.has_python_type(python_type) - assert registry.has_semantic_struct_signature(struct_type) - - -def test_integration_with_converter(): - registry = semantic_registry.SemanticTypeRegistry() - python_type = Mock(name="PythonType") - struct_type = Mock(name="StructType") - converter = Mock() - converter.python_type = python_type - converter.arrow_struct_type = struct_type - registry.register_converter("mock_type", converter) - retrieved = registry.get_converter_for_semantic_type("mock_type") - assert retrieved is converter - - -def test_uuid_type_registered_in_default_context(): - """uuid.UUID should be registered as an Arrow extension type in the default context.""" - from orcapod.contexts import get_default_context - - ctx = get_default_context() - arrow_type = ctx.type_converter.register_python_class(uuid.UUID) - assert isinstance(arrow_type, pa.ExtensionType), ( - "uuid.UUID must be registered as an Arrow extension type" - ) - - -def test_uuid_extension_type_resolves_to_python_type(): - """The Arrow extension type for UUID should resolve back to uuid.UUID.""" - from orcapod.contexts import get_default_context - - ctx = get_default_context() - arrow_type = ctx.type_converter.register_python_class(uuid.UUID) - python_type = ctx.type_converter.arrow_type_to_python_type(arrow_type) - assert python_type is uuid.UUID - - -def test_uuid_extension_name(): - """The UUID extension type should have the expected extension name.""" - from orcapod.contexts import get_default_context - - ctx = get_default_context() - arrow_type = ctx.type_converter.register_python_class(uuid.UUID) - assert isinstance(arrow_type, pa.ExtensionType) - assert "uuid" in arrow_type.extension_name.lower() - - -# Comprehensive unregister tests for future implementation -# Uncomment when unregister methods are implemented -# -# def test_unregister_by_semantic_type_name(): -# registry = semantic_registry.SemanticTypeRegistry() -# python_type = Mock(name="PythonType") -# struct_type = Mock(name="StructType") -# converter = Mock() -# converter.python_type = python_type -# converter.arrow_struct_type = struct_type -# registry.register_converter("mock_type", converter) -# result = registry.unregister_by_semantic_type_name("mock_type") -# assert result == {"mock_type": converter} -# assert not registry.has_semantic_type("mock_type") -# assert not registry.has_python_type(python_type) -# assert not registry.has_semantic_struct_signature(struct_type) -# assert registry.get_converter_for_semantic_type("mock_type") is None -# assert registry.get_converter_for_python_type(python_type) is None -# assert registry.get_converter_for_struct_signature(struct_type) is None -# -# def test_unregister_by_converter(): -# registry = semantic_registry.SemanticTypeRegistry() -# python_type = Mock(name="PythonType") -# struct_type = Mock(name="StructType") -# converter = Mock() -# converter.python_type = python_type -# converter.arrow_struct_type = struct_type -# registry.register_converter("mock_type", converter) -# result = registry.unregister_by_converter(converter) -# assert result == {"mock_type": converter} -# assert not registry.has_semantic_type("mock_type") -# assert not registry.has_python_type(python_type) -# assert not registry.has_semantic_struct_signature(struct_type) -# assert registry.get_converter_for_semantic_type("mock_type") is None -# assert registry.get_converter_for_python_type(python_type) is None -# assert registry.get_converter_for_struct_signature(struct_type) is None -# -# def test_unregister_by_python_type(): -# registry = semantic_registry.SemanticTypeRegistry() -# python_type = Mock(name="PythonType") -# struct_type = Mock(name="StructType") -# converter = Mock() -# converter.python_type = python_type -# converter.arrow_struct_type = struct_type -# registry.register_converter("mock_type", converter) -# result = registry.unregister_by_python_type(python_type) -# assert result == {"mock_type": converter} -# assert not registry.has_semantic_type("mock_type") -# assert not registry.has_python_type(python_type) -# assert not registry.has_semantic_struct_signature(struct_type) -# assert registry.get_converter_for_semantic_type("mock_type") is None -# assert registry.get_converter_for_python_type(python_type) is None -# assert registry.get_converter_for_struct_signature(struct_type) is None -# -# def test_unregister_by_struct_signature(): -# registry = semantic_registry.SemanticTypeRegistry() -# python_type = Mock(name="PythonType") -# struct_type = Mock(name="StructType") -# converter = Mock() -# converter.python_type = python_type -# converter.arrow_struct_type = struct_type -# registry.register_converter("mock_type", converter) -# result = registry.unregister_by_struct_signature(struct_type) -# assert result == {"mock_type": converter} -# assert not registry.has_semantic_type("mock_type") -# assert not registry.has_python_type(python_type) -# assert not registry.has_semantic_struct_signature(struct_type) -# assert registry.get_converter_for_semantic_type("mock_type") is None -# assert registry.get_converter_for_python_type(python_type) is None -# assert registry.get_converter_for_struct_signature(struct_type) is None diff --git a/tests/test_semantic_types/test_semantic_struct_converters.py b/tests/test_semantic_types/test_semantic_struct_converters.py deleted file mode 100644 index 168f1a45..00000000 --- a/tests/test_semantic_types/test_semantic_struct_converters.py +++ /dev/null @@ -1,107 +0,0 @@ -from orcapod.semantic_types.semantic_struct_converters import ( - SemanticStructConverterBase, -) - - -class DummyConverter(SemanticStructConverterBase): - def __init__(self): - super().__init__("dummy") - self._python_type = dict - self._arrow_struct_type = "dummy_struct" - - @property - def python_type(self): - return self._python_type - - @property - def arrow_struct_type(self): - return self._arrow_struct_type - - def python_to_struct_dict(self, value): - return value - - def struct_dict_to_python(self, struct_dict): - return struct_dict - - def can_handle_python_type(self, python_type): - return python_type is dict - - def can_handle_struct_type(self, struct_type): - return struct_type == "dummy_struct" - - def is_semantic_struct(self, struct_dict): - return isinstance(struct_dict, dict) - - def hash_struct_dict(self, struct_dict): - return "dummyhash" - - -# --- SemanticStructConverterBase tests --- -def test_semantic_struct_converter_base_properties(): - converter = DummyConverter() - assert converter.semantic_type_name == "dummy" - assert converter.hasher_id == "dummy_content_sha256" - - - -def test_compute_content_hash(): - converter = DummyConverter() - data = b"abc" - result = converter._compute_content_hash(data) - import hashlib - - assert result.digest == hashlib.sha256(data).digest() - - -# --- PythonPathStructConverter tests --- - - -def test_extensibility_with_new_converter(): - class NewConverter(SemanticStructConverterBase): - def __init__(self): - super().__init__("newtype") - self._python_type = list - self._arrow_struct_type = "new_struct" - - @property - def python_type(self): - return self._python_type - - @property - def arrow_struct_type(self): - return self._arrow_struct_type - - def python_to_struct_dict(self, value): - return {"data": value} - - def struct_dict_to_python(self, struct_dict): - return struct_dict["data"] - - def can_handle_python_type(self, python_type): - return python_type is list - - def can_handle_struct_type(self, struct_type): - return struct_type == "new_struct" - - def is_semantic_struct(self, struct_dict): - return "data" in struct_dict - - def hash_struct_dict(self, struct_dict): - return "newhash" - - converter = NewConverter() - assert converter.semantic_type_name == "newtype" - assert converter.python_to_struct_dict([1, 2, 3]) == {"data": [1, 2, 3]} - assert converter.struct_dict_to_python({"data": [1, 2, 3]}) == [1, 2, 3] - assert converter.can_handle_python_type(list) - assert converter.can_handle_struct_type("new_struct") - assert converter.is_semantic_struct({"data": [1, 2, 3]}) - assert converter.hash_struct_dict({"data": [1, 2, 3]}) == "newhash" - - -# --- Edge cases --- -def test_dummy_converter_edge_cases(): - converter = DummyConverter() - assert converter.is_semantic_struct({}) - assert not converter.is_semantic_struct(None) - assert converter.hash_struct_dict({}) == "dummyhash" diff --git a/tests/test_semantic_types/test_universal_converter.py b/tests/test_semantic_types/test_universal_converter.py deleted file mode 100644 index d8032ed0..00000000 --- a/tests/test_semantic_types/test_universal_converter.py +++ /dev/null @@ -1,1029 +0,0 @@ -import uuid as _uuid_module -from datetime import datetime, timezone -from pathlib import Path -from typing import Any, cast - -import numpy as np -import polars as pl -import pyarrow as pa -import pytest - -from orcapod.contexts import get_default_context -from orcapod.extension_types.registry import ( - LogicalTypeRegistry, - make_arrow_extension_type, -) -from orcapod.semantic_types import universal_converter -from orcapod.semantic_types.universal_converter import UniversalTypeConverter - - -def test_python_type_to_arrow_type_basic(): - assert universal_converter.python_type_to_arrow_type(int) == pa.int64() - assert universal_converter.python_type_to_arrow_type(float) == pa.float64() - assert universal_converter.python_type_to_arrow_type(str) == pa.large_string() - assert universal_converter.python_type_to_arrow_type(bool) == pa.bool_() - assert universal_converter.python_type_to_arrow_type(bytes) == pa.large_binary() - - -def test_python_type_to_arrow_type_datetime(): - assert universal_converter.python_type_to_arrow_type(datetime) == pa.timestamp( - "us", tz="UTC" - ) - - -def test_arrow_type_to_python_type_timestamp_with_tz(): - assert ( - universal_converter.arrow_type_to_python_type(pa.timestamp("us", tz="UTC")) - is datetime - ) - - -def test_arrow_type_to_python_type_timestamp_no_tz(): - assert universal_converter.arrow_type_to_python_type(pa.timestamp("us")) is datetime - - -def test_datetime_converter_rejects_naive(): - to_arrow, _ = universal_converter.get_conversion_functions(datetime) - naive = datetime(2024, 1, 15, 12, 30, 45, 123456) # no tzinfo - with pytest.raises(ValueError, match="Naive datetime"): - to_arrow(naive) - - -def test_datetime_converter_rejects_stub_tzinfo(): - """Rejects datetimes whose tzinfo.utcoffset() returns None (effectively naive).""" - import datetime as dt_mod - - class StubTzInfo(dt_mod.tzinfo): - def utcoffset(self, d): - return None # technically set but semantically naive - - def tzname(self, d): - return "Stub" - - def dst(self, d): - return None - - to_arrow, _ = universal_converter.get_conversion_functions(datetime) - stub_aware = datetime(2024, 1, 15, 12, 30, 45, tzinfo=StubTzInfo()) - with pytest.raises(ValueError, match="Naive datetime"): - to_arrow(stub_aware) - - -def test_datetime_converter_accepts_aware(): - to_arrow, _ = universal_converter.get_conversion_functions(datetime) - aware = datetime(2024, 1, 15, 12, 30, 45, 123456, tzinfo=timezone.utc) - result = to_arrow(aware) - assert result == aware - - -def test_datetime_converter_accepts_non_utc_aware(): - """Non-UTC timezone-aware datetimes pass through the converter unchanged. - - PyArrow normalises the value to UTC when writing to a pa.timestamp("us", tz="UTC") - column; the converter itself does not normalise — it only enforces the timezone - policy for naive datetimes. - """ - import zoneinfo - - to_arrow, _ = universal_converter.get_conversion_functions(datetime) - eastern = zoneinfo.ZoneInfo("America/New_York") - non_utc = datetime(2024, 1, 15, 12, 30, 45, tzinfo=eastern) - result = to_arrow(non_utc) - assert result == non_utc # converter passes through unchanged - - -def test_datetime_converter_passes_none_through(): - """None passes through the datetime converter unchanged (PyArrow enforces nullability).""" - to_arrow, _ = universal_converter.get_conversion_functions(datetime) - assert to_arrow(None) is None - - -def test_tz_less_arrow_timestamp_reads_as_naive(): - """Reading a tz-less Arrow timestamp column produces naive (timezone-less) datetimes. - - PyArrow's ``.as_py()`` on a tz-less timestamp returns a naive datetime. The - converter passes it through unchanged — no UTC attachment. To write these values - back via the converter use the ``"coerce_utc"`` timezone policy, or attach timezone - info manually before calling ``python_dicts_to_arrow_table``. - """ - converter = get_default_context().type_converter - naive_ts = datetime(2024, 5, 1, 9, 0, 0) - table = pa.table({"ts": pa.array([naive_ts], type=pa.timestamp("us"))}) - - rows_out = converter.arrow_table_to_python_dicts(table) - result = rows_out[0]["ts"] - - assert result.tzinfo is None - assert result == datetime(2024, 5, 1, 9, 0, 0) - - -def test_datetime_coerce_utc_converts_naive(): - """coerce_utc policy attaches timezone.utc to naive datetimes instead of raising.""" - converter = UniversalTypeConverter(datetime_timezone="coerce_utc") - to_arrow = converter.get_python_to_arrow_converter(datetime) - naive = datetime(2024, 1, 15, 12, 30, 45, 123456) - result = to_arrow(naive) - assert result == datetime(2024, 1, 15, 12, 30, 45, 123456, tzinfo=timezone.utc) - - -def test_datetime_coerce_utc_preserves_aware(): - """coerce_utc policy leaves already-aware datetimes unchanged.""" - converter = UniversalTypeConverter(datetime_timezone="coerce_utc") - to_arrow = converter.get_python_to_arrow_converter(datetime) - aware = datetime(2024, 1, 15, 12, 30, 45, 123456, tzinfo=timezone.utc) - result = to_arrow(aware) - assert result == aware - - -def test_datetime_round_trip(): - converter = get_default_context().type_converter - ts = datetime(2024, 3, 15, 10, 30, 45, 123456, tzinfo=timezone.utc) - rows_in = [{"event": "launch", "ts": ts}] - - # No explicit schema — exercises schema inference from data (type(value) -> datetime) - table = converter.python_dicts_to_arrow_table(rows_in) - - # Arrow schema must use timestamp(us, UTC) and be non-nullable for a plain datetime field - assert table.schema.field("ts").type == pa.timestamp("us", tz="UTC") - assert table.schema.field("ts").nullable is False - - rows_out = converter.arrow_table_to_python_dicts(table) - assert len(rows_out) == 1 - assert rows_out[0]["event"] == "launch" - assert rows_out[0]["ts"] == ts - - -def test_optional_datetime_round_trip(): - converter = get_default_context().type_converter - ts = datetime(2024, 6, 1, 0, 0, 0, tzinfo=timezone.utc) - rows_in = [ - {"label": "a", "ts": ts}, - {"label": "b", "ts": None}, - ] - python_schema = {"label": str, "ts": datetime | None} - - table = converter.python_dicts_to_arrow_table(rows_in, python_schema=python_schema) - - assert table.schema.field("ts").type == pa.timestamp("us", tz="UTC") - assert table.schema.field("ts").nullable is True - - rows_out = converter.arrow_table_to_python_dicts(table) - assert rows_out[0]["ts"] == ts - assert rows_out[1]["ts"] is None - - -def test_python_type_to_arrow_type_numpy(): - assert universal_converter.python_type_to_arrow_type(np.int32) == pa.int32() - assert universal_converter.python_type_to_arrow_type(np.float64) == pa.float64() - assert universal_converter.python_type_to_arrow_type(np.bool_) == pa.bool_() - - -def test_python_type_to_arrow_type_custom(): - """Path converts to an Arrow extension type when the default LogicalTypeRegistry is wired in.""" - arrow_type = universal_converter.python_type_to_arrow_type(Path) - # Path is registered in the default logical_type_registry — expect an extension type. - assert isinstance(arrow_type, pa.ExtensionType) - assert arrow_type.extension_name == "orcapod.path" - assert pa.types.is_large_string(arrow_type.storage_type) - - -def test_python_type_to_arrow_type_upath(): - from upath import UPath - - arrow_type = universal_converter.python_type_to_arrow_type(UPath) - # UPath is registered in the default logical_type_registry — expect an extension type. - assert isinstance(arrow_type, pa.ExtensionType) - assert arrow_type.extension_name == "orcapod.upath" - assert pa.types.is_large_string(arrow_type.storage_type) - - -def test_optional_upath_converter(): - """Test that Optional[UPath] correctly converts UPath values via the LogicalTypeRegistry.""" - from upath import UPath - - to_arrow, to_python = universal_converter.get_conversion_functions(UPath | None) - - # UPath is registered — python_to_storage returns the string representation. - path = UPath("/tmp/test.txt") - result = to_arrow(path) - assert result == str(path) - - # Test with None - assert to_arrow(None) is None - - -def test_complex_union_raises_error(): - """Test that complex unions (multiple non-None types) raise ValueError.""" - from upath import UPath - - with pytest.raises(ValueError, match="Complex unions"): - universal_converter.get_conversion_functions(UPath | Path) - - with pytest.raises(ValueError, match="Complex unions"): - universal_converter.python_type_to_arrow_type(UPath | Path) - - -def test_python_type_to_arrow_type_context(): - ctx = get_default_context() - assert universal_converter.python_type_to_arrow_type(int, ctx) == pa.int64() - - -def test_python_type_to_arrow_type_unsupported(): - class CustomType: - pass - - with pytest.raises(Exception): - universal_converter.python_type_to_arrow_type(CustomType) - - -def test_arrow_type_to_python_type_basic(): - assert universal_converter.arrow_type_to_python_type(pa.int64()) is int - assert universal_converter.arrow_type_to_python_type(pa.float64()) is float - assert universal_converter.arrow_type_to_python_type(pa.large_string()) is str - assert universal_converter.arrow_type_to_python_type(pa.bool_()) is bool - assert universal_converter.arrow_type_to_python_type(pa.large_binary()) is bytes - - -def test_arrow_type_to_python_type_context(): - ctx = get_default_context() - assert universal_converter.arrow_type_to_python_type(pa.int64(), ctx) is int - - -def test_arrow_type_to_python_type_unsupported(): - class FakeArrowType: - pass - - with pytest.raises(Exception): - universal_converter.arrow_type_to_python_type( - cast(pa.DataType, FakeArrowType()) - ) - - -def test_get_conversion_functions_basic(): - to_arrow, to_python = universal_converter.get_conversion_functions(int) - assert callable(to_arrow) - assert callable(to_python) - assert to_arrow(42) == 42 - assert to_python(42) == 42 - - -def test_get_conversion_functions_custom(): - to_arrow, to_python = universal_converter.get_conversion_functions(str) - assert to_arrow("abc") == "abc" - assert to_python("abc") == "abc" - - -def test_get_conversion_functions_context(): - ctx = get_default_context() - to_arrow, to_python = universal_converter.get_conversion_functions(float, ctx) - assert to_arrow(1.5) == 1.5 - assert to_python(1.5) == 1.5 - - -def test_python_type_to_arrow_type_list(): - # Unparameterized list should raise ValueError - with pytest.raises(ValueError): - universal_converter.python_type_to_arrow_type(list) - - -def test_python_type_to_arrow_type_dict(): - # Unparameterized dict should raise ValueError - with pytest.raises(ValueError): - universal_converter.python_type_to_arrow_type(dict) - - -def test_python_type_to_arrow_type_list_of_dict(): - # For list[dict[str, int]], expect LargeListType of LargeListType of StructType - arrow_type = universal_converter.python_type_to_arrow_type(list[dict[str, int]]) - # Should be LargeListType - assert arrow_type.__class__.__name__.endswith("ListType") - # Next level should also be LargeListType - arrow_type = cast(pa.ListType, arrow_type) - inner_list = arrow_type.value_type - assert inner_list.__class__.__name__.endswith("ListType") - # Innermost should be StructType - struct_type = inner_list.value_type - assert isinstance(struct_type, pa.StructType) - assert struct_type[0].name == "key" - assert struct_type[0].type == pa.large_string() - assert struct_type[1].name == "value" - assert struct_type[1].type == pa.int64() - - -def test_python_type_to_arrow_type_dict_of_list(): - # dict[str, list[int]] should be a LargeListType of StructType, with value field as LargeListType - arrow_type = universal_converter.python_type_to_arrow_type(dict[str, list[int]]) - assert arrow_type.__class__.__name__.endswith("ListType") - arrow_type = cast(pa.ListType, arrow_type) - struct_type = arrow_type.value_type - assert isinstance(struct_type, pa.StructType) - assert struct_type[0].name == "key" - assert struct_type[0].type == pa.large_string() - assert struct_type[1].name == "value" - value_type = struct_type[1].type - assert value_type.__class__.__name__.endswith("ListType") - assert value_type.value_type == pa.int64() - - -def test_python_type_to_arrow_type_list_of_list(): - arrow_type = universal_converter.python_type_to_arrow_type(list[list[int]]) - assert arrow_type.__class__.__name__.endswith("ListType") - arrow_type = cast(pa.ListType, arrow_type) - inner_list = arrow_type.value_type - assert inner_list.__class__.__name__.endswith("ListType") - assert inner_list.value_type == pa.int64() - - -def test_python_type_to_arrow_type_deeply_nested(): - # dict[str, list[list[dict[str, float]]]] - complex_type = dict[str, list[list[dict[str, float]]]] - arrow_type = universal_converter.python_type_to_arrow_type(complex_type) - # Should be a LargeListType of StructType - assert arrow_type.__class__.__name__.endswith("ListType") - arrow_type = cast(pa.ListType, arrow_type) - struct_type = arrow_type.value_type - assert isinstance(struct_type, pa.StructType) - assert struct_type[0].name == "key" - assert struct_type[0].type == pa.large_string() - assert struct_type[1].name == "value" - outer_list = struct_type[1].type - assert outer_list.__class__.__name__.endswith("ListType") - inner_list = outer_list.value_type - assert inner_list.__class__.__name__.endswith("ListType") - inner_struct_list = inner_list.value_type - assert inner_struct_list.__class__.__name__.endswith("ListType") - inner_struct = inner_struct_list.value_type - assert isinstance(inner_struct, pa.StructType) - assert inner_struct[0].name == "key" - assert inner_struct[0].type == pa.large_string() - assert inner_struct[1].name == "value" - assert inner_struct[1].type == pa.float64() - - -# Roundtrip tests for complex types -def test_roundtrip_list_of_int(): - py_val = [1, 2, 3, 4] - to_arrow, to_python = universal_converter.get_conversion_functions(list[int]) - arr = to_arrow(py_val) - py_val2 = to_python(arr) - assert py_val == py_val2 - - -def test_roundtrip_dict_str_int(): - py_val = {"a": 1, "b": 2} - to_arrow, to_python = universal_converter.get_conversion_functions(dict[str, int]) - arr = to_arrow(py_val) - py_val2 = to_python(arr) - # dict roundtrip may come back as dict or list of pairs - if isinstance(py_val2, dict): - assert py_val == py_val2 - else: - # Accept list of pairs - assert sorted(py_val.items()) == sorted( - [(d["key"], d["value"]) for d in py_val2] - ) - - -def test_roundtrip_list_of_list_of_float(): - py_val = [[1.1, 2.2], [3.3, 4.4]] - to_arrow, to_python = universal_converter.get_conversion_functions( - list[list[float]] - ) - arr = to_arrow(py_val) - py_val2 = to_python(arr) - assert py_val == py_val2 - - -def test_roundtrip_set_of_int(): - py_val = {1, 2, 3} - to_arrow, to_python = universal_converter.get_conversion_functions(set[int]) - arr = to_arrow(py_val) - py_val2 = to_python(arr) - # set will come back as list - assert py_val != py_val2 - assert set(py_val) == set(py_val2) - - -def test_roundtrip_various_complex_types(): - cases = [ - ([1, 2, 3], list[int]), - ([["a", "b"], ["c"]], list[list[str]]), - ({"a": 1, "b": 2}, dict[str, int]), - ([{"x": 1.1, "y": 2.2}, {"x": 3.3, "y": 4.4}], list[dict[str, float]]), - ({"a": [1, 2], "b": [3]}, dict[str, list[int]]), - ( - [{"a": [1, 2]}, {"b": [3], "c": [4, 5, 6]}], - list[dict[str, list[int]]], - ), - ( - [[{"k": "a", "v": 1.1}, {"k": "b", "v": 2.2}], [{"k": "c", "v": 3.3}]], - list[list[dict[str, float]]], - ), - ( - {"outer": [{"inner": [1, 2]}, {"inner": [3, 4]}]}, - dict[str, list[dict[str, list[int]]]], - ), - ({"a": {"b": {"c": 42}}}, dict[str, dict[str, dict[str, int]]]), - ({"a": None, "b": 2}, dict[str, int]), - ( - [{"x": [1, 2], "y": [3, 4]}, {"x": [5], "y": [6, 7]}], - list[dict[str, list[int]]], - ), - ] - for py_val, typ in cases: - to_arrow, to_python = universal_converter.get_conversion_functions(typ) - arr = to_arrow(py_val) - py_val2 = to_python(arr) - assert py_val == py_val2, f"Failed roundtrip for type {typ} with value {py_val}" - - -def test_incomplete_roundtrip_types(): - cases = [({"a": {1, 2}, "b": {3}}, dict[str, set[int]], {"a": [1, 2], "b": [3]})] - - for py_val, typ, expected_return in cases: - to_arrow, to_python = universal_converter.get_conversion_functions(typ) - arr = to_arrow(py_val) - py_val2 = to_python(arr) - assert py_val2 == expected_return, ( - f"Failed roundtrip for type {typ} with value {py_val}" - ) - - -def test_roundtrip_minimal_key_list_issue(): - py_val = [{"test": [1, 2, 3], "next": [3, 4]}] - typ = list[dict[str, list[int]]] - to_arrow, to_python = universal_converter.get_conversion_functions(typ) - arr = to_arrow(py_val) - py_val2 = to_python(arr) - print("Original:", py_val) - print("Roundtrip:", py_val2) - assert py_val == py_val2 - - -def test_roundtrip_simpler_key_issue_dict_str_list(): - py_val = {"a": [1, 2]} - typ = dict[str, list[int]] - to_arrow, to_python = universal_converter.get_conversion_functions(typ) - arr = to_arrow(py_val) - py_val2 = to_python(arr) - print("Original dict[str, list[int]]:", py_val) - print("Roundtrip:", py_val2) - assert py_val == py_val2 - - -def test_roundtrip_simpler_key_issue_list_dict_str_int(): - py_val = [{"key": "a", "value": 1}] - typ = list[dict[str, int]] - to_arrow, to_python = universal_converter.get_conversion_functions(typ) - arr = to_arrow(py_val) - py_val2 = to_python(arr) - print("Original list[dict[str, int]]:", py_val) - print("Roundtrip:", py_val2) - assert py_val == py_val2 - - -def test_inspect_arrow_schema_dict_str_list(): - py_val = {"test": [1, 2]} - typ = dict[str, list[int]] - arrow_type = universal_converter.python_type_to_arrow_type(typ) - print("Arrow type for dict[str, list[int]]:", arrow_type) - to_arrow_struct, to_python = universal_converter.get_conversion_functions(typ) - arr = to_arrow_struct(py_val) - assert arr == [{"key": "test", "value": [1, 2]}] - - -def test_schema_as_required_strips_optional_fields(): - from orcapod.types import Schema - - s = Schema({"a": int, "b": str}, optional_fields=["b"]) - result = s.as_required() - assert result == Schema({"a": int, "b": str}) - assert result.optional_fields == frozenset() - - -def test_schema_as_required_idempotent(): - from orcapod.types import Schema - - s = Schema({"a": int, "b": str}, optional_fields=["a", "b"]) - once = s.as_required() - twice = s.as_required().as_required() - assert once == twice - - -def test_python_schema_to_arrow_non_nullable(): - """Plain types (no | None) must produce nullable=False Arrow fields.""" - from orcapod.types import Schema - - ctx = get_default_context() - schema = ctx.type_converter.python_schema_to_arrow_schema( - Schema({"a": int, "b": str, "c": float, "d": bool, "e": bytes}) - ) - for name in ("a", "b", "c", "d", "e"): - assert schema.field(name).nullable is False, ( - f"Field '{name}' should be nullable=False for a plain type" - ) - - -def test_python_schema_to_arrow_optional_nullable(): - """Optional types (T | None) must produce nullable=True Arrow fields.""" - from orcapod.types import Schema - - ctx = get_default_context() - schema = ctx.type_converter.python_schema_to_arrow_schema( - Schema({"x": int | None, "y": str | None}) - ) - assert schema.field("x").nullable is True - assert schema.field("y").nullable is True - - -def test_arrow_schema_to_python_nullable_becomes_optional(): - """nullable=True Arrow fields must reconstruct as T | None.""" - ctx = get_default_context() - arrow_schema = pa.schema([pa.field("x", pa.int64(), nullable=True)]) - python_schema = ctx.type_converter.arrow_schema_to_python_schema(arrow_schema) - assert python_schema["x"] == int | None - - -def test_arrow_schema_to_python_non_nullable_stays_plain(): - """nullable=False Arrow fields must reconstruct as plain T.""" - ctx = get_default_context() - arrow_schema = pa.schema([pa.field("x", pa.int64(), nullable=False)]) - python_schema = ctx.type_converter.arrow_schema_to_python_schema(arrow_schema) - assert python_schema["x"] == int - - -def test_round_trip_preserves_optionality(): - """Python schema → Arrow → Python schema is lossless for nullable/non-nullable.""" - from orcapod.types import Schema - - ctx = get_default_context() - original = Schema({"required": int, "nullable_field": int | None}) - arrow = ctx.type_converter.python_schema_to_arrow_schema(original) - recovered = ctx.type_converter.arrow_schema_to_python_schema(arrow) - - assert recovered["required"] == int - assert recovered["nullable_field"] == int | None - assert recovered == original - - -# --------------------------------------------------------------------------- -# ENG-389: Any <-> pa.null() round-trip -# --------------------------------------------------------------------------- - - -def test_any_to_arrow_type(): - """typing.Any maps to pa.null().""" - assert universal_converter.python_type_to_arrow_type(Any) == pa.null() - - -def test_list_any_to_arrow_type(): - """list[Any] maps to pa.large_list(pa.null()).""" - assert ( - universal_converter.python_type_to_arrow_type(list[Any]) - == pa.large_list(pa.null()) - ) - - -def test_dict_any_any_to_arrow_type(): - """dict[Any, Any] maps to pa.large_list(pa.struct([("key", pa.null()), ("value", pa.null())])).""" - expected = pa.large_list( - pa.struct([("key", pa.null()), ("value", pa.null())]) - ) - assert universal_converter.python_type_to_arrow_type(dict[Any, Any]) == expected - - -def test_null_arrow_to_any_python_type(): - """pa.null() maps back to typing.Any.""" - assert universal_converter.arrow_type_to_python_type(pa.null()) is Any - - -def test_list_any_round_trip(): - """list[Any] round-trips: list[Any] -> pa.large_list(pa.null()) -> list[Any].""" - arrow_type = universal_converter.python_type_to_arrow_type(list[Any]) - assert universal_converter.arrow_type_to_python_type(arrow_type) == list[Any] - - -def test_dict_any_any_round_trip(): - """dict[Any, Any] round-trips through Arrow and back to dict[Any, Any].""" - arrow_type = universal_converter.python_type_to_arrow_type(dict[Any, Any]) - assert universal_converter.arrow_type_to_python_type(arrow_type) == dict[Any, Any] - - -def test_empty_container_inference_to_arrow_no_error(): - """Inferring schema from empty containers and converting to Arrow does not raise.""" - from orcapod.semantic_types.pydata_utils import infer_python_schema_from_pylist_data - from orcapod.semantic_types.universal_converter import UniversalTypeConverter - - schema = infer_python_schema_from_pylist_data([{"items": [], "meta": {}}]) - converter = UniversalTypeConverter() - # Must not raise ValueError: Unsupported Python type: typing.Any - arrow_schema = converter.python_schema_to_arrow_schema(schema) - assert "items" in [f.name for f in arrow_schema] - assert "meta" in [f.name for f in arrow_schema] - - -def test_pyarrow_empty_list_with_null_type(): - """PyArrow accepts empty lists for pa.large_list(pa.null()) and pa.large_list(pa.struct(...)) columns.""" - schema = pa.schema([ - pa.field("items", pa.large_list(pa.null())), - pa.field("meta", pa.large_list(pa.struct([("key", pa.null()), ("value", pa.null())]))), - ]) - table = pa.Table.from_pylist([{"items": [], "meta": []}], schema=schema) - assert table.num_rows == 1 - assert table.schema.field("items").type == pa.large_list(pa.null()) - - -# ── LogicalTypeRegistry priority tests ─────────────────────────────────────── - - -def _make_logical_type_stub(py_type: type, arrow_name: str): - """Return a minimal LogicalTypeProtocol conforming stub.""" - _ArrowExtClass = make_arrow_extension_type(arrow_name, pa.large_string()) - - class _PolarsExt(pl.BaseExtension): - def __init__(self): - super().__init__(arrow_name, pl.String, None) - @classmethod - def ext_from_params(cls, ext_name, storage_dtype, metadata_str): - return cls() - - class _Stub: - logical_type_name = arrow_name - python_type = py_type - - def get_arrow_extension_type(self): - return _ArrowExtClass() - - def get_polars_extension_type(self): - return _PolarsExt() - - def python_to_storage(self, value): - return str(value) - - def storage_to_python(self, storage_value): - return storage_value - - return _Stub() - - -class _MyCustomClass: - pass - - -def test_converter_uses_logical_type_registry_for_registered_type(): - """When a LogicalType is registered, converter returns its Arrow extension type.""" - arrow_name = f"test.MyCustomClass.{_uuid_module.uuid4().hex[:8]}" - lt = _make_logical_type_stub(_MyCustomClass, arrow_name) - - registry = LogicalTypeRegistry() - registry.register_logical_type(lt) - - converter = UniversalTypeConverter(logical_type_registry=registry) - - result = converter.python_type_to_arrow_type(_MyCustomClass) - expected_ext = lt.get_arrow_extension_type() - assert result == expected_ext - - -def test_converter_falls_through_for_unregistered_type(): - """If type not in LogicalTypeRegistry, converter falls through to old system (int → int64).""" - registry = LogicalTypeRegistry() - converter = UniversalTypeConverter(logical_type_registry=registry) - - result = converter.python_type_to_arrow_type(int) - assert result == pa.int64() - - -def test_converter_without_registry_unchanged(): - """With no logical_type_registry, converter behaves exactly as before.""" - converter = UniversalTypeConverter() - assert converter.python_type_to_arrow_type(str) == pa.large_string() - - -def test_data_context_type_converter_holds_logical_type_registry(): - """DataContext's type_converter has a non-None _logical_type_registry.""" - from orcapod.contexts import get_default_context - ctx = get_default_context() - assert hasattr(ctx.type_converter, "_logical_type_registry") - assert ctx.type_converter._logical_type_registry is not None - - -# ── Helpers for new tests ──────────────────────────────────────────────────── - -import dataclasses -import pathlib -from typing import Optional - -from orcapod.extension_types.registry import make_polars_extension_type - - -def _make_registry_with_builtins() -> LogicalTypeRegistry: - """Registry with LogicalPath, LogicalUUID, LogicalUPath pre-registered.""" - from orcapod.extension_types.builtin_logical_types import LogicalPath, LogicalUUID, LogicalUPath - return LogicalTypeRegistry(logical_types=[LogicalPath(), LogicalUUID(), LogicalUPath()]) - - -def _make_converter(registry: LogicalTypeRegistry | None = None) -> UniversalTypeConverter: - if registry is None: - registry = _make_registry_with_builtins() - return UniversalTypeConverter(logical_type_registry=registry) - - -# ── register_python_class tests ────────────────────────────────────────────── - -def test_register_python_class_primitive_int(): - converter = _make_converter() - assert converter.register_python_class(int) == pa.int64() - - -def test_register_python_class_primitive_str(): - converter = _make_converter() - assert converter.register_python_class(str) == pa.large_string() - - -def test_register_python_class_list_of_int(): - converter = _make_converter() - result = converter.register_python_class(list[int]) - assert result == pa.large_list(pa.int64()) - - -def test_register_python_class_optional_str(): - converter = _make_converter() - result = converter.register_python_class(Optional[str]) - assert result == pa.large_string() - - -def test_register_python_class_dict_str_int(): - converter = _make_converter() - result = converter.register_python_class(dict[str, int]) - expected = pa.large_list(pa.struct([pa.field("key", pa.large_string()), pa.field("value", pa.int64())])) - assert result == expected - - -def test_register_python_class_set_of_str(): - converter = _make_converter() - result = converter.register_python_class(set[str]) - assert result == pa.large_list(pa.large_string()) - - -def test_register_python_class_registry_hit_path(): - """pathlib.Path is pre-registered → returns the orcapod.path extension type.""" - converter = _make_converter() - result = converter.register_python_class(pathlib.Path) - assert isinstance(result, pa.ExtensionType) - assert result.extension_name == "orcapod.path" - - -def test_register_python_class_uuid_registry_hit(): - converter = _make_converter() - result = converter.register_python_class(_uuid_module.UUID) - assert isinstance(result, pa.ExtensionType) - assert result.extension_name == "orcapod.uuid" - - -def test_register_python_class_factory_dispatch(): - """A custom class triggers factory synthesis and caches the result.""" - import uuid as _u - - class _Base: - pass - - class _Child(_Base): - pass - - ext_name = f"test.custom.{_u.uuid4().hex[:8]}" - ArrowExt = make_arrow_extension_type(ext_name, pa.large_string()) - PolarsExt = make_polars_extension_type(ext_name, pa.large_string()) - synthesized_calls = [] - - class _Factory: - def supports_class(self, python_type): - return issubclass(python_type, _Base) - def create_for_python_type(self, python_type, converter): - synthesized_calls.append(python_type) - class _LT: - logical_type_name = ext_name - python_type_ = _Child - python_type = _Child - def get_arrow_extension_type(self): return ArrowExt() - def get_polars_extension_type(self): return PolarsExt() - def python_to_storage(self, v, c=None): return str(v) - def storage_to_python(self, v, c=None): return v - return _LT() - def reconstruct_from_arrow(self, name, storage, meta, converter): pass - - registry = _make_registry_with_builtins() - registry.register_logical_type_factory(_Factory(), python_bases=[_Base]) - converter = _make_converter(registry) - - result = converter.register_python_class(_Child) - assert isinstance(result, pa.ExtensionType) - assert result.extension_name == ext_name - assert _Child in synthesized_calls - - # Second call is a registry hit — factory NOT called again - result2 = converter.register_python_class(_Child) - assert result2 == result - assert len(synthesized_calls) == 1 - - -def test_register_python_class_cycle_detection(): - """Cyclic type synthesis raises TypeError.""" - - class _CycleClass: - pass - - class _CycleFactory: - def supports_class(self, python_type): - return python_type is _CycleClass - def create_for_python_type(self, python_type, converter): - # Intentionally trigger a cycle - converter.register_python_class(_CycleClass) - def reconstruct_from_arrow(self, name, storage, meta, converter): pass - - registry = _make_registry_with_builtins() - registry.register_logical_type_factory(_CycleFactory(), python_bases=[_CycleClass]) - converter = _make_converter(registry) - - with pytest.raises(TypeError, match="[Cc]ircular"): - converter.register_python_class(_CycleClass) - - -def test_register_python_class_list_of_uuid_raises(): - """list[UUID] raises ValueError: UUID is a logical type and cannot be preserved - inside a list value field (ET2 in DESIGN_ISSUES.md). Tracked in PLT-1732.""" - converter = _make_converter() - with pytest.raises(ValueError, match="PLT-1732"): - converter.register_python_class(list[_uuid_module.UUID]) - - -def test_register_python_class_dict_str_uuid_raises(): - """dict[str, UUID] raises ValueError: UUID is a logical type and cannot be preserved - inside a struct field (ET1/ET2 in DESIGN_ISSUES.md). Tracked in PLT-1732.""" - converter = _make_converter() - with pytest.raises(ValueError, match="PLT-1732"): - converter.register_python_class(dict[str, _uuid_module.UUID]) - - -# ── register_storage_type tests ────────────────────────────────────────────── - -def test_register_storage_type_primitive_int(): - converter = _make_converter() - assert converter.register_storage_type(pa.int64()) == pa.int64() - - -def test_register_storage_type_primitive_large_string(): - converter = _make_converter() - assert converter.register_storage_type(pa.large_string()) == pa.large_string() - - -def test_register_storage_type_extension_type_registry_hit(): - """An already-registered extension type is returned unchanged (no-op).""" - converter = _make_converter() - from orcapod.extension_types.builtin_logical_types import LogicalUUID - uuid_ext = LogicalUUID().get_arrow_extension_type() - result = converter.register_storage_type(uuid_ext) - assert isinstance(result, pa.ExtensionType) - assert result.extension_name == "orcapod.uuid" - - -def test_register_storage_type_struct_recurses(): - """Structs are traversed field by field; resolved field types are returned.""" - converter = _make_converter() - struct_type = pa.struct([pa.field("name", pa.large_string()), pa.field("count", pa.int64())]) - result = converter.register_storage_type(struct_type) - assert pa.types.is_struct(result) - assert result.field("name").type == pa.large_string() - assert result.field("count").type == pa.int64() - - -def test_register_storage_type_large_list_recurses(): - converter = _make_converter() - list_type = pa.large_list(pa.int32()) - result = converter.register_storage_type(list_type) - assert pa.types.is_large_list(result) - assert result.value_type == pa.int32() - - -def test_register_storage_type_extension_miss_dispatches_to_factory(): - """An unregistered extension type triggers factory.reconstruct_from_arrow.""" - import json - import uuid as _u - - ext_name = f"test.reconstruct.{_u.uuid4().hex[:8]}" - category = "test.reconstruct" - metadata = json.dumps({"category": category}).encode() - ArrowExt = make_arrow_extension_type(ext_name, pa.large_string(), metadata=metadata) - PolarsExt = make_polars_extension_type(ext_name, pa.large_string()) - - class _LT: - logical_type_name = ext_name - python_type = str - def get_arrow_extension_type(self): return ArrowExt() - def get_polars_extension_type(self): return PolarsExt() - def python_to_storage(self, v, c=None): return str(v) - def storage_to_python(self, v, c=None): return v - - class _Factory: - def supports_class(self, t): return False - def create_for_python_type(self, t, converter): pass - def reconstruct_from_arrow(self, name, storage_type, meta, converter): - return _LT() - - registry = _make_registry_with_builtins() - registry.register_logical_type_factory(_Factory(), category=category) - converter = _make_converter(registry) - - ext_instance = ArrowExt() - result = converter.register_storage_type(ext_instance) - assert isinstance(result, pa.ExtensionType) - assert result.extension_name == ext_name - - # Second call: registry hit → same result, factory NOT called again - result2 = converter.register_storage_type(ext_instance) - assert result2.extension_name == ext_name - - -def test_register_storage_type_nested_struct_with_extension(): - """Extension type nested inside a struct field is stripped to storage type (ET1).""" - import json - import uuid as _u - - ext_name = f"test.nested.{_u.uuid4().hex[:8]}" - category = "test.nested" - metadata = json.dumps({"category": category}).encode() - ArrowExt = make_arrow_extension_type(ext_name, pa.large_string(), metadata=metadata) - PolarsExt = make_polars_extension_type(ext_name, pa.large_string()) - - class _LT: - logical_type_name = ext_name - python_type = str - def get_arrow_extension_type(self): return ArrowExt() - def get_polars_extension_type(self): return PolarsExt() - def python_to_storage(self, v, c=None): return str(v) - def storage_to_python(self, v, c=None): return v - - class _Factory: - def supports_class(self, t): return False - def create_for_python_type(self, t, converter): pass - def reconstruct_from_arrow(self, name, storage_type, meta, converter): - return _LT() - - registry = _make_registry_with_builtins() - registry.register_logical_type_factory(_Factory(), category=category) - converter = _make_converter(registry) - - ext_instance = ArrowExt() - struct_with_ext = pa.struct([pa.field("id", pa.int64()), pa.field("tag", ext_instance)]) - result = converter.register_storage_type(struct_with_ext) - - assert pa.types.is_struct(result) - assert result.field("id").type == pa.int64() - # Storage-safe: extension type inside struct field is stripped to its storage type - assert result.field("tag").type == pa.large_string() - assert not isinstance(result.field("tag").type, pa.ExtensionType) - # Side effect: the extension type IS registered (check via registry) - assert converter._logical_type_registry.get_by_arrow_extension_name(ext_name) is not None - - -# ── python_to_storage / storage_to_python / pass-through tests ─────────────── - -def test_python_to_storage_for_registered_type(): - """python_to_storage uses the logical type's converter for registered types.""" - converter = _make_converter() - result = converter.python_to_storage(pathlib.Path("/tmp/bar"), pathlib.Path) - assert result == "/tmp/bar" - - -def test_storage_to_python_for_registered_type(): - converter = _make_converter() - result = converter.storage_to_python("/tmp/bar", pathlib.Path) - assert isinstance(result, pathlib.Path) - assert result == pathlib.Path("/tmp/bar") - - -def test_python_to_storage_for_int(): - converter = _make_converter() - assert converter.python_to_storage(42, int) == 42 - - -def test_register_logical_type_passthrough(): - from orcapod.extension_types.builtin_logical_types import LogicalPath - registry = LogicalTypeRegistry() - converter = UniversalTypeConverter(logical_type_registry=registry) - lt = LogicalPath() - converter.register_logical_type(lt) - assert registry.get_by_python_type(pathlib.Path) is lt - - -def test_register_logical_type_factory_passthrough(): - class _Factory: - def supports_class(self, t): return False - def create_for_python_type(self, t, converter): pass - def reconstruct_from_arrow(self, name, storage, meta, converter): pass - - registry = LogicalTypeRegistry() - converter = UniversalTypeConverter(logical_type_registry=registry) - factory = _Factory() - converter.register_logical_type_factory(factory, category="test.cat") - assert registry._category_factories.get("test.cat") is factory diff --git a/tests/test_semantic_types/test_upath_struct_converter.py b/tests/test_semantic_types/test_upath_struct_converter.py deleted file mode 100644 index ccfe014f..00000000 --- a/tests/test_semantic_types/test_upath_struct_converter.py +++ /dev/null @@ -1,148 +0,0 @@ -from pathlib import Path -from typing import cast - -import pytest -from upath import UPath - -from orcapod.hashing.file_hashers import BasicFileHasher -from orcapod.semantic_types.semantic_struct_converters import UPathStructConverter - - -@pytest.fixture -def file_hasher(): - return BasicFileHasher(algorithm="sha256") - - -@pytest.fixture -def converter(file_hasher): - return UPathStructConverter(file_hasher=file_hasher) - - -def test_upath_to_struct_and_back(converter): - path_obj = UPath("/tmp/test.txt") - struct_dict = converter.python_to_struct_dict(path_obj) - assert struct_dict["upath"] == str(path_obj) - restored = converter.struct_dict_to_python(struct_dict) - assert isinstance(restored, UPath) - assert str(restored) == str(path_obj) - - -def test_upath_to_struct_invalid_type(converter): - with pytest.raises(TypeError): - converter.python_to_struct_dict(Path("/tmp/test.txt")) # type: ignore - - -def test_struct_to_python_missing_field(converter): - with pytest.raises(ValueError): - converter.struct_dict_to_python({}) - - -def test_can_handle_python_type(converter): - assert converter.can_handle_python_type(UPath) - assert not converter.can_handle_python_type(str) - assert not converter.can_handle_python_type(Path) - - -def test_can_handle_struct_type(converter): - struct_type = converter.arrow_struct_type - assert converter.can_handle_struct_type(struct_type) - - -def test_is_semantic_struct(converter): - assert converter.is_semantic_struct({"upath": "/tmp/test.txt"}) - assert not converter.is_semantic_struct({"path": "/tmp/test.txt"}) - assert not converter.is_semantic_struct({"upath": 123}) - - -def test_hash_struct_dict_file_not_found(converter, tmp_path): - struct_dict = {"upath": str(tmp_path / "does_not_exist.txt")} - with pytest.raises(FileNotFoundError): - converter.hash_struct_dict(struct_dict) - - -def test_hash_struct_dict_is_directory(converter, tmp_path): - struct_dict = {"upath": str(tmp_path)} - with pytest.raises(IsADirectoryError): - converter.hash_struct_dict(struct_dict) - - -def test_hash_struct_dict_content_based(converter, tmp_path): - """Two distinct files with identical content produce the same hash.""" - file1 = tmp_path / "file1.txt" - file2 = tmp_path / "file2.txt" - content = "identical content" - file1.write_text(content) - file2.write_text(content) - hash1 = converter.hash_struct_dict({"upath": str(file1)}) - hash2 = converter.hash_struct_dict({"upath": str(file2)}) - assert hash1 == hash2 - - -def test_hash_struct_dict_with_prefix(converter, tmp_path): - """Hash always starts with 'upath:sha256:'.""" - file = tmp_path / "file.txt" - file.write_text("hello") - hash_str = converter.hash_struct_dict({"upath": str(file)}) - assert hash_str.startswith("upath:sha256:") - - -def test_hash_struct_dict_different_content(converter, tmp_path): - """Same path with modified content yields a different hash.""" - file = tmp_path / "mutable.txt" - file.write_text("version 1") - hash1 = converter.hash_struct_dict({"upath": str(file)}) - file.write_text("version 2") - hash2 = converter.hash_struct_dict({"upath": str(file)}) - assert hash1 != hash2 - - -def test_hash_struct_dict_missing_field(converter): - with pytest.raises(ValueError, match="Missing 'upath' field"): - converter.hash_struct_dict({}) - - -def test_upath_arrow_struct_type(converter): - """The Arrow struct type has a single 'upath' field of large_string.""" - import pyarrow as pa - - struct_type = converter.arrow_struct_type - assert isinstance(struct_type, pa.StructType) - assert len(struct_type) == 1 - assert struct_type[0].name == "upath" - assert struct_type[0].type == pa.large_string() - - -def test_path_and_upath_struct_types_differ(): - """Path and UPath converters produce distinct Arrow struct types.""" - from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter - - file_hasher = BasicFileHasher(algorithm="sha256") - path_conv = PythonPathStructConverter(file_hasher=file_hasher) - upath_conv = UPathStructConverter(file_hasher=file_hasher) - - assert path_conv.arrow_struct_type != upath_conv.arrow_struct_type - assert path_conv.arrow_struct_type[0].name == "path" - assert upath_conv.arrow_struct_type[0].name == "upath" - - -def test_path_converter_rejects_upath(): - """PythonPathStructConverter rejects UPath instances to avoid ambiguity.""" - from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter - - file_hasher = BasicFileHasher(algorithm="sha256") - path_conv = PythonPathStructConverter(file_hasher=file_hasher) - - upath_val = UPath("/tmp/test.txt") - with pytest.raises(TypeError, match="not UPath"): - path_conv.python_to_struct_dict(upath_val) - - -def test_path_converter_cannot_handle_upath_type(): - """PythonPathStructConverter.can_handle_python_type returns False for UPath.""" - from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter - - file_hasher = BasicFileHasher(algorithm="sha256") - path_conv = PythonPathStructConverter(file_hasher=file_hasher) - - assert not path_conv.can_handle_python_type(UPath) - assert path_conv.can_handle_python_type(Path) diff --git a/tests/test_semantic_types/test_uuid_struct_converter.py b/tests/test_semantic_types/test_uuid_struct_converter.py deleted file mode 100644 index c8084991..00000000 --- a/tests/test_semantic_types/test_uuid_struct_converter.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Tests for UUIDStructConverter.""" -import uuid - -import pyarrow as pa -import pytest - -from orcapod.semantic_types.semantic_struct_converters import UUIDStructConverter - - -@pytest.fixture -def converter(): - return UUIDStructConverter() - - -@pytest.fixture -def sample_uuid(): - return uuid.UUID("550e8400-e29b-41d4-a716-446655440000") - - -def test_python_type(converter): - assert converter.python_type is uuid.UUID - - -def test_arrow_struct_type(converter): - assert converter.arrow_struct_type == pa.struct([pa.field("uuid", pa.binary(16))]) - - -def test_semantic_type_name(converter): - assert converter.semantic_type_name == "uuid" - - -def test_python_to_struct_dict(converter, sample_uuid): - result = converter.python_to_struct_dict(sample_uuid) - assert result == {"uuid": sample_uuid.bytes} - assert isinstance(result["uuid"], bytes) - assert len(result["uuid"]) == 16 - - -def test_python_to_struct_dict_rejects_non_uuid(converter): - with pytest.raises(TypeError): - converter.python_to_struct_dict("550e8400-e29b-41d4-a716-446655440000") # type: ignore - - -def test_struct_dict_to_python(converter, sample_uuid): - struct_dict = {"uuid": sample_uuid.bytes} - result = converter.struct_dict_to_python(struct_dict) - assert result == sample_uuid - assert isinstance(result, uuid.UUID) - - -def test_struct_dict_to_python_from_bytearray(converter, sample_uuid): - """Arrow may return binary fields as bytearray — must handle both.""" - struct_dict = {"uuid": bytearray(sample_uuid.bytes)} - result = converter.struct_dict_to_python(struct_dict) - assert result == sample_uuid - - -def test_struct_dict_to_python_missing_field(converter): - with pytest.raises(ValueError, match="Missing 'uuid' field"): - converter.struct_dict_to_python({}) - - -def test_round_trip(converter, sample_uuid): - struct_dict = converter.python_to_struct_dict(sample_uuid) - recovered = converter.struct_dict_to_python(struct_dict) - assert recovered == sample_uuid - - -def test_round_trip_all_versions(): - """Verify round-trip works for uuid4, uuid5, and uuid7 (uuid_utils). - - ``uuid_utils.UUID`` objects do not inherit from ``uuid.UUID`` and their - ``__eq__`` does not cross-compare with ``uuid.UUID``, so we compare by - the canonical string representation instead of direct equality. - """ - from uuid_utils import uuid7 - - converter = UUIDStructConverter() - for u in [uuid.uuid4(), uuid.uuid5(uuid.NAMESPACE_OID, "test"), uuid7()]: - recovered = converter.struct_dict_to_python(converter.python_to_struct_dict(u)) - assert str(recovered) == str(u) - - -def test_arrow_array_round_trip(converter, sample_uuid): - """Verify UUID survives a PyArrow array round-trip.""" - struct_dict = converter.python_to_struct_dict(sample_uuid) - arr = pa.array([struct_dict], type=pa.struct([pa.field("uuid", pa.binary(16))])) - recovered_dict = arr[0].as_py() - recovered_uuid = converter.struct_dict_to_python(recovered_dict) - assert recovered_uuid == sample_uuid - - -def test_distinct_uuids_produce_distinct_struct_dicts(converter): - u1, u2 = uuid.uuid4(), uuid.uuid4() - assert converter.python_to_struct_dict(u1) != converter.python_to_struct_dict(u2) - - -def test_can_handle_python_type_uuid(converter): - assert converter.can_handle_python_type(uuid.UUID) is True - - -def test_can_handle_python_type_rejects_str(converter): - assert converter.can_handle_python_type(str) is False - - -def test_can_handle_struct_type_uuid(converter): - assert converter.can_handle_struct_type(pa.struct([pa.field("uuid", pa.binary(16))])) is True - - -def test_can_handle_struct_type_rejects_other(converter): - import pyarrow as pa - - assert converter.can_handle_struct_type(pa.struct([pa.field("path", pa.large_string())])) is False - - -def test_hash_struct_dict_returns_string(converter, sample_uuid): - struct_dict = converter.python_to_struct_dict(sample_uuid) - result = converter.hash_struct_dict(struct_dict) - assert isinstance(result, str) - assert len(result) > 0 - - -def test_hash_struct_dict_consistent(converter, sample_uuid): - """Same UUID always produces the same hash.""" - struct_dict = converter.python_to_struct_dict(sample_uuid) - assert converter.hash_struct_dict(struct_dict) == converter.hash_struct_dict(struct_dict) - - -def test_hash_struct_dict_different_uuids(converter): - """Different UUIDs produce different hashes.""" - u1, u2 = uuid.uuid4(), uuid.uuid4() - d1 = converter.python_to_struct_dict(u1) - d2 = converter.python_to_struct_dict(u2) - assert converter.hash_struct_dict(d1) != converter.hash_struct_dict(d2) From 80385072d00f8e3949cfdb33427e311c3e39dd96 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 19:43:24 +0000 Subject: [PATCH 20/33] fix(PLT-1660): fix broken get_default_arrow_hasher, add passthrough test, fix stale log message - get_default_arrow_hasher(): remove broken set_cacher() call and cache_file_hash param; StarfixArrowHasher has no set_cacher method. Replaced with a simple delegate to get_default_context().arrow_hasher. - semantic_hasher.py: update stale log message from SemanticHasherProtocol (non-strict) to SemanticAwarePythonHasher (non-strict) with more descriptive text. - test_extension_type_hashing.py: add test_unregistered_python_type_passes_through to TestSemanticHashingVisitorExtension covering the branch where extension type is recognized but has no semantic hasher registered. Note: Fix 2 (remove pa.Table/pa.RecordBatch from v0.1.json) was not applied because Datagram.identity_structure() explicitly depends on ArrowTableSemanticHasher being registered to hash pa.Table objects (documented in datagram.py docstring). Removing these entries breaks 1 test (test_merge_join) and the fundamental design. The lazy context resolution in ArrowTableSemanticHasher._get_arrow_hasher() already handles the circular dependency concern raised in the review. Co-Authored-By: Claude Sonnet 4.6 --- src/orcapod/hashing/defaults.py | 45 +++---------------- .../semantic_hashing/semantic_hasher.py | 2 +- .../test_extension_type_hashing.py | 23 ++++++++++ 3 files changed, 31 insertions(+), 39 deletions(-) diff --git a/src/orcapod/hashing/defaults.py b/src/orcapod/hashing/defaults.py index 0dc8b6c2..21034936 100644 --- a/src/orcapod/hashing/defaults.py +++ b/src/orcapod/hashing/defaults.py @@ -49,46 +49,15 @@ def get_default_semantic_hasher() -> hp.SemanticHasherProtocol: return get_default_context().semantic_hasher -def get_default_arrow_hasher( - cache_file_hash: bool | hp.StringCacherProtocol = True, -) -> hp.ArrowHasherProtocol: - """ - Return the ArrowHasherProtocol from the default data context. - - If ``cache_file_hash`` is True an in-memory StringCacherProtocol is attached to - the hasher so that repeated hashes of the same file path are served from - cache. Pass a ``StringCacherProtocol`` instance to use a custom caching backend - (e.g. SQLite-backed). - - Note: caching is applied on top of the context's arrow hasher each time - this function is called. If you need a single shared cached instance, - obtain it once and store it yourself. +def get_default_arrow_hasher() -> hp.ArrowHasherProtocol: + """Return the ArrowHasherProtocol from the default data context. - Args: - cache_file_hash: True to use an ephemeral in-memory cache, a - StringCacherProtocol instance to use a custom cache, or False/None to - disable caching. + Note: file-hash caching (formerly via ``set_cacher``) has been removed. + ``StarfixArrowHasher`` does not support per-path caching. Use + ``CachedFileHasher`` when constructing a custom context if caching is needed. Returns: - ArrowHasherProtocol: The arrow hasher from the default data context, - optionally with file-hash caching attached. + ArrowHasherProtocol: The arrow hasher from the default data context. """ - from typing import Any - from orcapod.contexts import get_default_context - - arrow_hasher: Any = get_default_context().arrow_hasher - - if cache_file_hash: - from orcapod.hashing.string_cachers import InMemoryCacher - - if cache_file_hash is True: - string_cacher: hp.StringCacherProtocol = InMemoryCacher(max_size=None) - else: - string_cacher = cache_file_hash - - # set_cacher is present on StarfixArrowHasher but not on the - # ArrowHasherProtocol protocol, so we call it via Any to avoid a type error. - arrow_hasher.set_cacher("path", string_cacher) - - return arrow_hasher + return get_default_context().arrow_hasher diff --git a/src/orcapod/hashing/semantic_hashing/semantic_hasher.py b/src/orcapod/hashing/semantic_hashing/semantic_hasher.py index bcc18b51..300f6987 100644 --- a/src/orcapod/hashing/semantic_hashing/semantic_hasher.py +++ b/src/orcapod/hashing/semantic_hashing/semantic_hasher.py @@ -391,7 +391,7 @@ def _handle_unknown(self, obj: Any) -> str: ) logger.warning( - "SemanticHasherProtocol (non-strict): no handler for type '%s'. " + "SemanticAwarePythonHasher (non-strict): no PythonTypeSemanticHasherProtocol registered for type '%s'. " "Falling back to best-effort string representation.", qualified, ) diff --git a/tests/test_hashing/test_extension_type_hashing.py b/tests/test_hashing/test_extension_type_hashing.py index f371ef9b..56a8d822 100644 --- a/tests/test_hashing/test_extension_type_hashing.py +++ b/tests/test_hashing/test_extension_type_hashing.py @@ -121,3 +121,26 @@ def test_null_value_passthrough(self, ctx): assert new_type == arrow_type assert new_data is None + + def test_unregistered_python_type_passes_through(self, ctx): + """Extension types with no registered semantic hasher pass through unchanged.""" + import uuid + from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeSemanticHasherRegistry + from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher + + # Build a hasher with a registry that has NO entry for UUID + empty_registry = PythonTypeSemanticHasherRegistry() + stripped_hasher = SemanticAwarePythonHasher( + hasher_id="test_v0", + type_semantic_hasher_registry=empty_registry, + ) + + arrow_type = ctx.type_converter.register_python_class(uuid.UUID) + storage_val = ctx.type_converter.python_to_storage(uuid.UUID("12345678-1234-5678-1234-567812345678"), uuid.UUID) + + visitor = SemanticHashingVisitor(ctx.type_converter, stripped_hasher) + new_type, new_data = visitor.visit(arrow_type, storage_val) + + # Should be completely unchanged since UUID has no semantic hasher + assert new_type == arrow_type + assert new_data == storage_val From d34c504d61cad3cad668b2ee353e590c2bf24b54 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 20:39:12 +0000 Subject: [PATCH 21/33] docs(PLT-1660): add implementation plan for hard-cut extension type hashing --- ...lt-1660-hard-cut-extension-type-hashing.md | 2466 +++++++++++++++++ 1 file changed, 2466 insertions(+) create mode 100644 superpowers/plans/2026-06-24-plt-1660-hard-cut-extension-type-hashing.md diff --git a/superpowers/plans/2026-06-24-plt-1660-hard-cut-extension-type-hashing.md b/superpowers/plans/2026-06-24-plt-1660-hard-cut-extension-type-hashing.md new file mode 100644 index 00000000..a4642fdb --- /dev/null +++ b/superpowers/plans/2026-06-24-plt-1660-hard-cut-extension-type-hashing.md @@ -0,0 +1,2466 @@ +# PLT-1660: Hard Cut Extension Type Hashing — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use sensei:subagent-driven-development (recommended) or sensei:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Delete the old shape-based `SemanticTypeRegistry` system, wire the new extension-type system into Arrow hashing, and rename all protocol/registry/handler classes to cleaner names. + +**Architecture:** `ArrowTypeDataVisitor` gains a `visit_extension()` hook (default: passthrough). `SemanticHashingVisitor` overrides it: for extension types whose Python counterpart has a registered semantic hasher, it converts the value to a Python object, hashes it, and stores the result as `pa.large_binary()` in the format `:::`. Unrecognized extension types pass through unmodified — starfix still sees their full metadata. All `TypeHandlerProtocol.handle()->Any` handlers are tightened to `PythonTypeSemanticHasherProtocol.hash()->ContentHash`. + +**Tech Stack:** Python 3.10+, PyArrow extension types, starfix-python, uv/pytest + +--- + +## File Map + +**Modified source:** +- `src/orcapod/protocols/hashing_protocols.py` — rename `TypeHandlerProtocol`→`PythonTypeSemanticHasherProtocol`, `handle()`→`hash()->ContentHash`; rename `type_handler_registry`→`type_semantic_hasher_registry` on `SemanticHasherProtocol` +- `src/orcapod/hashing/semantic_hashing/type_handler_registry.py` — rename class + all methods +- `src/orcapod/hashing/semantic_hashing/builtin_handlers.py` — rename 11 handler classes; `handle()`→`hash()->ContentHash`; rename `register_builtin_handlers` +- `src/orcapod/hashing/semantic_hashing/semantic_hasher.py` — rename `BaseSemanticHasher`→`SemanticAwarePythonHasher`; simplify dispatch; rename property +- `src/orcapod/hashing/semantic_hashing/content_identifiable_mixin.py` — update import + type annotations +- `src/orcapod/hashing/semantic_hashing/__init__.py` — update exports +- `src/orcapod/hashing/__init__.py` — update exports +- `src/orcapod/hashing/defaults.py` — rename function; update property access; remove broken `set_cacher` call +- `src/orcapod/hashing/visitors.py` — add `visit_extension` to base class + rewrite `SemanticHashingVisitor` +- `src/orcapod/hashing/arrow_hashers.py` — update `StarfixArrowHasher` constructor + short-circuit; delete `SemanticArrowHasher` +- `src/orcapod/hashing/versioned_hashers.py` — source `StarfixArrowHasher` from context; rename imports +- `src/orcapod/contexts/data/v0.1.json` — reorder components; remove `semantic_registry`; update class names and refs; add `type_converter`+`semantic_hasher` to `arrow_hasher`; remove `pa.Table` handlers (cycle-break) +- `src/orcapod/contexts/data/schemas/context_schema.json` — remove `semantic_registry` property; rename `type_handler_registry`→`python_type_semantic_hasher_registry` +- `src/orcapod/contexts/core.py` — update docstring for renamed property +- `src/orcapod/semantic_types/__init__.py` — remove `SemanticTypeRegistry` export +- `src/orcapod/protocols/semantic_types_protocols.py` — delete `SemanticStructConverterProtocol` + +**Deleted source:** +- `src/orcapod/semantic_types/semantic_struct_converters.py` +- `src/orcapod/semantic_types/semantic_registry.py` + +**Deleted tests:** +- `tests/test_semantic_types/` (all 9 files) +- `tests/test_hashing/test_file_hashing_consistency.py` + +**New tests:** +- `tests/test_hashing/test_extension_type_hashing.py` + +**Updated tests:** +- `tests/test_hashing/test_semantic_hasher.py` +- `tests/test_hashing/test_starfix_arrow_hasher.py` + +--- + +## Task 1: Rename `TypeHandlerProtocol` → `PythonTypeSemanticHasherProtocol` + +**Files:** +- Modify: `src/orcapod/protocols/hashing_protocols.py` + +- [ ] **Step 1: Rewrite the protocol class and update surrounding references** + +Replace the entire `TypeHandlerProtocol` class and update the `SemanticHasherProtocol`'s `type_handler_registry` property: + +```python +# In src/orcapod/protocols/hashing_protocols.py + +# Update TYPE_CHECKING import: +if TYPE_CHECKING: + import pyarrow as pa + from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeSemanticHasherRegistry + from orcapod.types import ContentHash # already imported at module level, just noting + +# Replace TypeHandlerProtocol with: +class PythonTypeSemanticHasherProtocol(Protocol): + """Protocol for type-specific semantic hashers used by SemanticAwarePythonHasher. + + A PythonTypeSemanticHasherProtocol hashes a specific Python type to a ``ContentHash``. + Implementations are registered with a ``PythonTypeSemanticHasherRegistry`` and looked + up via MRO-aware resolution. + + Each implementation receives the full ``SemanticAwarePythonHasher`` so it can delegate + hashing of sub-values (e.g. hashing a dict of function metadata) back to the outer + hasher without coupling to a specific hasher instance. + """ + + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + """Hash *obj* to a ContentHash. + + Args: + obj: The object to hash. Always matches the registered type. + hasher: The active ``SemanticAwarePythonHasher``. Use + ``hasher.hash_object(sub_value)`` to hash sub-values. + + Returns: + ContentHash: The content-addressed hash of *obj*. + """ + ... + + +# Update SemanticHasherProtocol — rename the property: +class SemanticHasherProtocol(Protocol): + # ... existing methods unchanged ... + + @property + def type_semantic_hasher_registry(self) -> "PythonTypeSemanticHasherRegistry": + """Return the PythonTypeSemanticHasherRegistry used by this hasher.""" + ... +``` + +The full updated `hashing_protocols.py` (only `TypeHandlerProtocol` is renamed and `SemanticHasherProtocol.type_handler_registry` → `type_semantic_hasher_registry`; everything else is unchanged): + +```python +"""Hash strategy protocols for dependency injection.""" + +from __future__ import annotations + +from collections.abc import Callable +from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable + +from orcapod.types import ContentHash, PathLike, Schema + +if TYPE_CHECKING: + import pyarrow as pa + from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeSemanticHasherRegistry + + +@runtime_checkable +class DataContextAwareProtocol(Protocol): + """Protocol for objects aware of their data context.""" + + @property + def data_context_key(self) -> str: + """Return the data context key associated with this object.""" + ... + + +@runtime_checkable +class PipelineElementProtocol(Protocol): + """Protocol for objects that have a stable identity as an element in a pipeline graph.""" + + def pipeline_identity_structure(self) -> Any: + """Return a structure representing this element's pipeline identity.""" + ... + + def pipeline_hash(self, hasher=None) -> ContentHash: + """Return the pipeline-level hash of this element.""" + ... + + +@runtime_checkable +class ContentIdentifiableProtocol(Protocol): + """Protocol for objects that can express their semantic identity as a plain Python structure.""" + + def identity_structure(self) -> Any: + """Return a structure that represents the semantic identity of this object.""" + ... + + def content_hash(self, hasher: "SemanticHasherProtocol | None" = None) -> ContentHash: + """Returns the content hash.""" + ... + + +class PythonTypeSemanticHasherProtocol(Protocol): + """Protocol for type-specific semantic hashers used by SemanticAwarePythonHasher. + + A ``PythonTypeSemanticHasherProtocol`` hashes a specific Python type to a + ``ContentHash``. Implementations are registered with a + ``PythonTypeSemanticHasherRegistry`` and looked up via MRO-aware resolution. + + Each implementation receives the full ``SemanticAwarePythonHasher`` so it can + delegate hashing of sub-values back to the outer hasher without coupling to a + specific hasher instance. + """ + + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + """Hash *obj* to a ContentHash. + + Args: + obj: The object to hash. Always matches the registered type. + hasher: The active ``SemanticAwarePythonHasher``. Use + ``hasher.hash_object(sub_value)`` to hash sub-values. + + Returns: + ContentHash: The content-addressed hash of *obj*. + """ + ... + + +class SemanticHasherProtocol(Protocol): + """Protocol for the semantic content-based hasher.""" + + def hash_object( + self, + obj: Any, + resolver: Callable[[Any], ContentHash] | None = None, + ) -> ContentHash: + """Hash *obj* based on its semantic content.""" + ... + + @property + def hasher_id(self) -> str: + """Returns a unique identifier/name for this hasher instance.""" + ... + + @property + def type_semantic_hasher_registry(self) -> "PythonTypeSemanticHasherRegistry": + """Return the PythonTypeSemanticHasherRegistry used by this hasher.""" + ... + + +class FileContentHasherProtocol(Protocol): + """Protocol for file-related hashing.""" + + def hash_file(self, file_path: PathLike) -> ContentHash: ... + + +@runtime_checkable +class ArrowHasherProtocol(Protocol): + """Protocol for hashing arrow data.""" + + @property + def hasher_id(self) -> str: ... + + def hash_table(self, table: "pa.Table | pa.RecordBatch") -> ContentHash: ... + + +class StringCacherProtocol(Protocol): + """Protocol for caching string key value pairs.""" + + def get_cached(self, cache_key: str) -> str | None: ... + def set_cached(self, cache_key: str, value: str) -> None: ... + def clear_cache(self) -> None: ... + + +class FunctionInfoExtractorProtocol(Protocol): + """Protocol for extracting function information.""" + + def extract_function_info( + self, + func: Callable[..., Any], + function_name: str | None = None, + input_typespec: Schema | None = None, + output_typespec: Schema | None = None, + exclude_function_signature: bool = False, + exclude_function_body: bool = False, + ) -> dict[str, Any]: ... + + +class SemanticTypeHasherProtocol(Protocol): + """Abstract base class for semantic type-specific hashers.""" + + @property + def hasher_id(self) -> str: + """Unique identifier for this semantic type hasher.""" + ... + + def hash_column(self, column: "pa.Array") -> "pa.Array": + """Hash a column with this semantic type and return the hash bytes as an array.""" + ... + + def set_cacher(self, cacher: StringCacherProtocol) -> None: + """Add a string cacher for caching hash values.""" + ... +``` + +- [ ] **Step 2: Commit** + +```bash +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + add src/orcapod/protocols/hashing_protocols.py +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + commit -m "refactor(hashing_protocols): rename TypeHandlerProtocol → PythonTypeSemanticHasherProtocol, tighten hash() → ContentHash" +``` + +--- + +## Task 2: Rename `TypeHandlerRegistry` → `PythonTypeSemanticHasherRegistry` + +**Files:** +- Modify: `src/orcapod/hashing/semantic_hashing/type_handler_registry.py` + +- [ ] **Step 1: Rename the class, subclass, and all methods** + +Write the complete new file: + +```python +""" +PythonTypeSemanticHasherRegistry — MRO-aware registry for PythonTypeSemanticHasherProtocol instances. +""" + +from __future__ import annotations + +import logging +import threading +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from orcapod.protocols.hashing_protocols import ( + ArrowHasherProtocol, + PythonTypeSemanticHasherProtocol, + ) + +logger = logging.getLogger(__name__) + + +class PythonTypeSemanticHasherRegistry: + """Registry mapping Python types to PythonTypeSemanticHasherProtocol instances. + + Lookup is MRO-aware: when no hasher is registered for the exact type of + an object, the registry walks the object's MRO (most-derived first) until + it finds a match. + + Thread safety + ------------- + Registration and lookup are protected by a reentrant lock so that the + global singleton can be safely used from multiple threads. + """ + + def __init__( + self, handlers: list[tuple[type, "PythonTypeSemanticHasherProtocol"]] | None = None + ) -> None: + """ + Args: + handlers: Optional list of ``(target_type, hasher)`` pairs to + register at construction time. + """ + self._handlers: dict[type, "PythonTypeSemanticHasherProtocol"] = {} + self._lock = threading.RLock() + if handlers: + for target_type, handler in handlers: + self.register(target_type, handler) + + def register(self, target_type: type, handler: "PythonTypeSemanticHasherProtocol") -> None: + """Register a hasher for a specific Python type. + + If a hasher is already registered for *target_type*, it is silently + replaced by the new hasher. + + Args: + target_type: The Python type (or class) for which the hasher should be used. + handler: A ``PythonTypeSemanticHasherProtocol`` instance. + + Raises: + TypeError: If ``target_type`` is not a ``type``. + """ + if not isinstance(target_type, type): + raise TypeError( + f"target_type must be a type/class, got {type(target_type)!r}" + ) + with self._lock: + existing = self._handlers.get(target_type) + if existing is not None and existing is not handler: + logger.debug( + "PythonTypeSemanticHasherRegistry: replacing existing hasher for %s (%s -> %s)", + target_type.__name__, + type(existing).__name__, + type(handler).__name__, + ) + self._handlers[target_type] = handler + + def unregister(self, target_type: type) -> bool: + """Remove the hasher registered for *target_type*, if any. + + Args: + target_type: The type whose hasher should be removed. + + Returns: + True if a hasher was removed, False if none was registered. + """ + with self._lock: + if target_type in self._handlers: + del self._handlers[target_type] + return True + return False + + def get_semantic_hasher(self, obj: Any) -> "PythonTypeSemanticHasherProtocol | None": + """Look up the hasher for *obj* using MRO-aware resolution. + + Args: + obj: The object for which a hasher is needed. + + Returns: + The registered ``PythonTypeSemanticHasherProtocol``, or None. + """ + obj_type = type(obj) + with self._lock: + handler = self._handlers.get(obj_type) + if handler is not None: + return handler + for base in obj_type.__mro__[1:]: + handler = self._handlers.get(base) + if handler is not None: + logger.debug( + "PythonTypeSemanticHasherRegistry: resolved hasher for %s via base %s", + obj_type.__name__, + base.__name__, + ) + return handler + return None + + def get_semantic_hasher_for_type( + self, target_type: type + ) -> "PythonTypeSemanticHasherProtocol | None": + """Look up the hasher for a *type object* (rather than an instance). + + Args: + target_type: The type to look up. + + Returns: + The registered ``PythonTypeSemanticHasherProtocol``, or None. + """ + with self._lock: + handler = self._handlers.get(target_type) + if handler is not None: + return handler + for base in target_type.__mro__[1:]: + handler = self._handlers.get(base) + if handler is not None: + return handler + return None + + def has_semantic_hasher(self, target_type: type) -> bool: + """Return True if a hasher is registered for *target_type* or any MRO ancestor. + + Args: + target_type: The type to check. + """ + return self.get_semantic_hasher_for_type(target_type) is not None + + def registered_types(self) -> list[type]: + """Return a list of all directly-registered types (no MRO expansion).""" + with self._lock: + return list(self._handlers.keys()) + + def __repr__(self) -> str: + with self._lock: + names = [t.__name__ for t in self._handlers] + return f"PythonTypeSemanticHasherRegistry(registered={names!r})" + + def __len__(self) -> int: + with self._lock: + return len(self._handlers) + + +def get_default_python_type_semantic_hasher_registry() -> "PythonTypeSemanticHasherRegistry": + """Return the PythonTypeSemanticHasherRegistry from the default data context. + + This is a convenience wrapper; the registry is owned and versioned by the + active ``DataContext``. Importing this function from + ``orcapod.hashing.defaults`` or ``orcapod.hashing`` is equivalent. + """ + from orcapod.hashing.defaults import ( + get_default_python_type_semantic_hasher_registry as _get, + ) + return _get() + + +class BuiltinPythonTypeSemanticHasherRegistry(PythonTypeSemanticHasherRegistry): + """A PythonTypeSemanticHasherRegistry pre-populated with all built-in hashers. + + Constructed via the data context JSON spec so that the default registry + is versioned alongside the rest of the context components. + """ + + def __init__(self, arrow_hasher: "ArrowHasherProtocol | None" = None) -> None: + super().__init__() + from orcapod.hashing.semantic_hashing.builtin_handlers import ( + register_builtin_python_type_semantic_hashers, + ) + register_builtin_python_type_semantic_hashers(self, arrow_hasher=arrow_hasher) +``` + +- [ ] **Step 2: Commit** + +```bash +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + add src/orcapod/hashing/semantic_hashing/type_handler_registry.py +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + commit -m "refactor(type_handler_registry): rename to PythonTypeSemanticHasherRegistry, rename methods" +``` + +--- + +## Task 3: Rename + tighten all builtin handlers + +**Files:** +- Modify: `src/orcapod/hashing/semantic_hashing/builtin_handlers.py` + +- [ ] **Step 1: Write the complete updated file** + +Key changes: +- 11 class renames (all `*Handler`/`*ContentHandler` → `*SemanticHasher`) +- `handle(obj, hasher) -> Any` → `hash(obj, hasher) -> ContentHash` on every class +- `UUIDSemanticHasher`, `BytesSemanticHasher`, `FunctionSemanticHasher`, `TypeObjectSemanticHasher`, `SpecialFormSemanticHasher`, `GenericAliasSemanticHasher`, `UnionTypeSemanticHasher` now call `hasher.hash_object(...)` to return `ContentHash` directly +- `register_builtin_handlers` → `register_builtin_python_type_semantic_hashers` +- Remove `SemanticArrowHasher` fallback construction (it will be deleted); when `arrow_hasher is None`, skip registering `pa.Table`/`pa.RecordBatch` handlers + +```python +""" +Built-in PythonTypeSemanticHasherProtocol implementations. + + PathSemanticHasher -- pathlib.Path: file content hash + UPathSemanticHasher -- upath.UPath: file content hash (remote-aware) + UUIDSemanticHasher -- uuid.UUID: 16-byte binary representation + BytesSemanticHasher -- bytes/bytearray: hex string representation + FunctionSemanticHasher -- callable with __code__: via FunctionInfoExtractorProtocol + TypeObjectSemanticHasher -- type objects: stable "type:." string + SpecialFormSemanticHasher -- typing._SpecialForm + GenericAliasSemanticHasher -- generic alias type annotations + UnionTypeSemanticHasher -- types.UnionType (Python 3.10+ X | Y syntax) + ArrowTableSemanticHasher -- pa.Table / pa.RecordBatch + SchemaSemanticHasher -- Schema objects + +``register_builtin_python_type_semantic_hashers(registry)`` populates a registry +with all of the above. +""" + +from __future__ import annotations + +import logging +from pathlib import Path +from typing import TYPE_CHECKING, Any +from uuid import UUID + +from upath import UPath + +from orcapod.types import ContentHash, PathLike, Schema + +if TYPE_CHECKING: + from orcapod.hashing.semantic_hashing.type_handler_registry import ( + PythonTypeSemanticHasherRegistry, + ) + from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher + from orcapod.protocols.hashing_protocols import ( + ArrowHasherProtocol, + FileContentHasherProtocol, + ) + +logger = logging.getLogger(__name__) + + +class PathSemanticHasher: + """Hasher for pathlib.Path objects — hashes file *content*. + + Args: + file_hasher: Any object with a ``hash_file(path) -> ContentHash`` method. + """ + + def __init__(self, file_hasher: "FileContentHasherProtocol") -> None: + self.file_hasher = file_hasher + + def hash(self, obj: PathLike, hasher: "SemanticAwarePythonHasher") -> ContentHash: + path: Path = Path(obj) + if not path.exists(): + raise FileNotFoundError( + f"PathSemanticHasher: path does not exist: {path!r}. " + "Paths must refer to existing files for content-based hashing." + ) + if path.is_dir(): + raise IsADirectoryError( + f"PathSemanticHasher: path is a directory: {path!r}. " + "Only regular files are supported for content-based hashing." + ) + logger.debug("PathSemanticHasher: hashing file content at %s", path) + return self.file_hasher.hash_file(path) + + +class UPathSemanticHasher: + """Hasher for universal_pathlib.UPath objects — hashes file content. + + Args: + file_hasher: Any object with a ``hash_file(path) -> ContentHash`` method. + """ + + def __init__(self, file_hasher: "FileContentHasherProtocol") -> None: + self.file_hasher = file_hasher + + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + if not isinstance(obj, UPath): + raise TypeError( + f"UPathSemanticHasher: expected a UPath, got {type(obj)!r}." + ) + if not obj.exists(): + raise FileNotFoundError( + f"UPathSemanticHasher: path does not exist: {obj!r}." + ) + if obj.is_dir(): + raise IsADirectoryError( + f"UPathSemanticHasher: path is a directory: {obj!r}." + ) + logger.debug("UPathSemanticHasher: hashing file content at %s", obj) + return self.file_hasher.hash_file(obj) + + +class UUIDSemanticHasher: + """Hasher for ``uuid.UUID`` objects — hashes the raw 16-byte binary representation.""" + + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + return hasher.hash_object(obj.bytes) + + +class BytesSemanticHasher: + """Hasher for bytes and bytearray objects — hashes the lowercase hex representation.""" + + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + if isinstance(obj, (bytes, bytearray)): + return hasher.hash_object(obj.hex()) + raise TypeError( + f"BytesSemanticHasher: expected bytes or bytearray, got {type(obj)!r}" + ) + + +class FunctionSemanticHasher: + """Hasher for Python functions/callables with a ``__code__`` attribute. + + Args: + function_info_extractor: Any object with an + ``extract_function_info(func) -> dict`` method. + """ + + def __init__(self, function_info_extractor: Any) -> None: + self.function_info_extractor = function_info_extractor + + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + if not (callable(obj) and hasattr(obj, "__code__")): + raise TypeError( + f"FunctionSemanticHasher: expected a callable with __code__, got {type(obj)!r}" + ) + func_name = getattr(obj, "__name__", repr(obj)) + logger.debug("FunctionSemanticHasher: extracting info for function %r", func_name) + info: dict[str, Any] = self.function_info_extractor.extract_function_info(obj) + return hasher.hash_object(info) + + +class TypeObjectSemanticHasher: + """Hasher for type objects (classes passed as values). + + Returns a stable string of the form ``"type:."``. + """ + + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + if not isinstance(obj, type): + raise TypeError( + f"TypeObjectSemanticHasher: expected a type/class, got {type(obj)!r}" + ) + module: str = obj.__module__ or "" + qualname: str = obj.__qualname__ + return hasher.hash_object(f"type:{module}.{qualname}") + + +class SpecialFormSemanticHasher: + """Hasher for ``typing._SpecialForm`` objects such as ``typing.Union``.""" + + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + name = getattr(obj, "_name", None) or repr(obj) + return hasher.hash_object(f"special_form:typing.{name}") + + +class GenericAliasSemanticHasher: + """Hasher for generic alias type annotations (``dict[int, str]``, ``Optional[X]``, etc.).""" + + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + import typing + + origin = getattr(obj, "__origin__", None) + args = getattr(obj, "__args__", None) or () + if origin is None: + return hasher.hash_object(f"generic_alias:{obj!r}") + if origin is typing.Union: + hashed_args = sorted(hasher.hash_object(arg).to_string() for arg in args) + return hasher.hash_object({"__type__": "union", "args": hashed_args}) + return hasher.hash_object({ + "__type__": "generic_alias", + "origin": hasher.hash_object(origin).to_string(), + "args": [hasher.hash_object(arg).to_string() for arg in args], + }) + + +class UnionTypeSemanticHasher: + """Hasher for ``types.UnionType`` objects (Python 3.10+ ``X | Y`` syntax).""" + + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + args = getattr(obj, "__args__", None) or () + hashed_args = sorted(hasher.hash_object(arg).to_string() for arg in args) + return hasher.hash_object({"__type__": "union", "args": hashed_args}) + + +class ArrowTableSemanticHasher: + """Hasher for ``pa.Table`` and ``pa.RecordBatch`` objects. + + Args: + arrow_hasher: Any object satisfying ``ArrowHasherProtocol``. + """ + + def __init__(self, arrow_hasher: "ArrowHasherProtocol") -> None: + self.arrow_hasher = arrow_hasher + + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + import pyarrow as _pa + + if isinstance(obj, _pa.RecordBatch): + obj = _pa.Table.from_batches([obj]) + if not isinstance(obj, _pa.Table): + raise TypeError( + f"ArrowTableSemanticHasher: expected pa.Table or pa.RecordBatch, got {type(obj)!r}" + ) + return self.arrow_hasher.hash_table(obj) + + +class SchemaSemanticHasher: + """Hasher for ``Schema`` objects.""" + + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + if not isinstance(obj, Schema): + raise TypeError( + f"SchemaSemanticHasher: expected a Schema, got {type(obj)!r}" + ) + raise NotImplementedError("SchemaSemanticHasher is not yet implemented.") + + +def register_builtin_python_type_semantic_hashers( + registry: "PythonTypeSemanticHasherRegistry", + file_hasher: Any = None, + function_info_extractor: Any = None, + arrow_hasher: "ArrowHasherProtocol | None" = None, +) -> None: + """Register all built-in semantic hashers into *registry*. + + When ``arrow_hasher`` is None, ``pa.Table`` and ``pa.RecordBatch`` handlers + are **not** registered (to avoid circular dependency in the JSON context + construction — the default context's ``python_type_semantic_hasher_registry`` + is built before ``arrow_hasher``). + + Args: + registry: The ``PythonTypeSemanticHasherRegistry`` to populate. + file_hasher: Optional ``FileContentHasherProtocol`` for path hashing. + Defaults to ``BasicFileHasher(sha256)``. + function_info_extractor: Optional ``FunctionInfoExtractorProtocol``. + Defaults to ``FunctionSignatureExtractor``. + arrow_hasher: Optional ``ArrowHasherProtocol`` for nested table hashing. + When None, Arrow table handlers are skipped. + """ + if file_hasher is None: + from orcapod.hashing.file_hashers import BasicFileHasher + file_hasher = BasicFileHasher(algorithm="sha256") + + if function_info_extractor is None: + from orcapod.hashing.semantic_hashing.function_info_extractors import ( + FunctionSignatureExtractor, + ) + function_info_extractor = FunctionSignatureExtractor( + include_module=True, + include_defaults=True, + ) + + bytes_hasher = BytesSemanticHasher() + registry.register(bytes, bytes_hasher) + registry.register(bytearray, bytes_hasher) + + registry.register(Path, PathSemanticHasher(file_hasher)) + registry.register(UPath, UPathSemanticHasher(file_hasher)) + registry.register(UUID, UUIDSemanticHasher()) + + import types as _types + + function_hasher = FunctionSemanticHasher(function_info_extractor) + registry.register(_types.FunctionType, function_hasher) + registry.register(_types.BuiltinFunctionType, function_hasher) + registry.register(_types.MethodType, function_hasher) + + registry.register(type, TypeObjectSemanticHasher()) + registry.register(_types.UnionType, UnionTypeSemanticHasher()) + + generic_alias_hasher = GenericAliasSemanticHasher() + registry.register(_types.GenericAlias, generic_alias_hasher) + try: + import typing as _typing + registry.register(_typing._GenericAlias, generic_alias_hasher) # type: ignore[attr-defined] + registry.register(_typing._SpecialForm, SpecialFormSemanticHasher()) # type: ignore[attr-defined] + except AttributeError: + pass + + registry.register(Schema, SchemaSemanticHasher()) + + if arrow_hasher is not None: + import pyarrow as _pa + arrow_table_hasher = ArrowTableSemanticHasher(arrow_hasher) + registry.register(_pa.Table, arrow_table_hasher) + registry.register(_pa.RecordBatch, arrow_table_hasher) + + logger.debug( + "register_builtin_python_type_semantic_hashers: registered %d hashers", + len(registry), + ) +``` + +- [ ] **Step 2: Commit** + +```bash +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + add src/orcapod/hashing/semantic_hashing/builtin_handlers.py +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + commit -m "refactor(builtin_handlers): rename handler classes, tighten hash() → ContentHash" +``` + +--- + +## Task 4: Rename `BaseSemanticHasher` → `SemanticAwarePythonHasher`, simplify dispatch + +**Files:** +- Modify: `src/orcapod/hashing/semantic_hashing/semantic_hasher.py` + +- [ ] **Step 1: Apply renames and simplify hash_object dispatch** + +Changes: +1. Class name `BaseSemanticHasher` → `SemanticAwarePythonHasher` +2. `__init__` parameter `type_handler_registry` → `type_semantic_hasher_registry` +3. `self._registry = get_default_type_handler_registry()` → `get_default_python_type_semantic_hasher_registry()` +4. `type_handler_registry` property → `type_semantic_hasher_registry` +5. Return type annotation `TypeHandlerRegistry` → `PythonTypeSemanticHasherRegistry` +6. `hash_object` dispatch: `get_handler` → `get_semantic_hasher`; remove double-wrap (handler now returns `ContentHash` directly) + +The dispatch block in `hash_object` changes from: +```python +handler = self._registry.get_handler(obj) +if handler is not None: + return self.hash_object(handler.handle(obj, self), resolver=resolver) +``` +to: +```python +semantic_hasher = self._registry.get_semantic_hasher(obj) +if semantic_hasher is not None: + return semantic_hasher.hash(obj, self) +``` + +Full updated file (only showing the changed parts — keep everything else identical): + +```python +# At top of file, update import: +from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeSemanticHasherRegistry + +# Class rename: +class SemanticAwarePythonHasher: + """ + Content-based recursive hasher. + [same docstring, just update BaseSemanticHasher references to SemanticAwarePythonHasher] + """ + + def __init__( + self, + hasher_id: str, + type_semantic_hasher_registry: PythonTypeSemanticHasherRegistry | None = None, + strict: bool = True, + ) -> None: + self._hasher_id = hasher_id + self._strict = strict + + if type_semantic_hasher_registry is None: + from orcapod.hashing.defaults import get_default_python_type_semantic_hasher_registry + self._registry = get_default_python_type_semantic_hasher_registry() + else: + self._registry = type_semantic_hasher_registry + + @property + def hasher_id(self) -> str: + return self._hasher_id + + @property + def strict(self) -> bool: + return self._strict + + @property + def type_semantic_hasher_registry(self) -> PythonTypeSemanticHasherRegistry: + """Return the ``PythonTypeSemanticHasherRegistry`` used by this hasher.""" + return self._registry + + def hash_object(self, obj, resolver=None): + # ... keep all existing logic, EXCEPT replace the handler dispatch block: + + # Old: + # handler = self._registry.get_handler(obj) + # if handler is not None: + # return self.hash_object(handler.handle(obj, self), resolver=resolver) + + # New: + # semantic_hasher = self._registry.get_semantic_hasher(obj) + # if semantic_hasher is not None: + # return semantic_hasher.hash(obj, self) + ... +``` + +The complete updated `hash_object` method (copy the full existing body, changing only the handler dispatch): + +```python +def hash_object( + self, + obj: Any, + resolver: Callable[[Any], ContentHash] | None = None, +) -> ContentHash: + """Hash *obj* based on its semantic content.""" + # Terminal: already a hash -- return as-is. + if isinstance(obj, ContentHash): + return obj + + # Primitives: hash their direct JSON representation. + if isinstance(obj, (type(None), bool, int, float, str)): + return self._hash_to_content_hash(obj) + + # Structures: expand into a tagged tree, then hash the tree. + if _is_structure(obj): + expanded = self._expand_structure( + obj, _visited=frozenset(), resolver=resolver + ) + return self._hash_to_content_hash(expanded) + + # Semantic hasher dispatch: the hasher produces a ContentHash directly. + semantic_hasher = self._registry.get_semantic_hasher(obj) + if semantic_hasher is not None: + logger.debug( + "hash_object: dispatching %s to semantic hasher %s", + type(obj).__name__, + type(semantic_hasher).__name__, + ) + return semantic_hasher.hash(obj, self) + + # ContentIdentifiableProtocol: use resolver if provided, else content_hash(). + if isinstance(obj, hp.ContentIdentifiableProtocol): + if resolver is not None: + logger.debug( + "hash_object: resolving ContentIdentifiableProtocol %s via resolver", + type(obj).__name__, + ) + return resolver(obj) + else: + logger.debug( + "hash_object: using ContentIdentifiableProtocol %s's content_hash", + type(obj).__name__, + ) + return obj.content_hash() + + # Fallback for unhandled types. + fallback = self._handle_unknown(obj) + return self._hash_to_content_hash(fallback) +``` + +- [ ] **Step 2: Commit** + +```bash +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + add src/orcapod/hashing/semantic_hashing/semantic_hasher.py +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + commit -m "refactor(semantic_hasher): rename BaseSemanticHasher → SemanticAwarePythonHasher, simplify dispatch" +``` + +--- + +## Task 5: Update `content_identifiable_mixin.py` and `contexts/core.py` + +**Files:** +- Modify: `src/orcapod/hashing/semantic_hashing/content_identifiable_mixin.py` +- Modify: `src/orcapod/contexts/core.py` + +- [ ] **Step 1: Update `content_identifiable_mixin.py`** + +Three changes: +1. Line 68: `from orcapod.hashing.semantic_hashing.semantic_hasher import BaseSemanticHasher` → `SemanticAwarePythonHasher` +2. Line 97: parameter `semantic_hasher: BaseSemanticHasher | None` → `SemanticAwarePythonHasher | None` +3. Line 218 (approximately): `def _get_hasher(self) -> BaseSemanticHasher:` → `SemanticAwarePythonHasher` +4. Update the class docstring reference from `BaseSemanticHasher` to `SemanticAwarePythonHasher` + +```python +# Old line 68: +from orcapod.hashing.semantic_hashing.semantic_hasher import BaseSemanticHasher + +# New: +from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher +``` + +```python +# Old __init__ signature: +def __init__( + self, *, semantic_hasher: BaseSemanticHasher | None = None, **kwargs: Any +) -> None: + +# New: +def __init__( + self, *, semantic_hasher: SemanticAwarePythonHasher | None = None, **kwargs: Any +) -> None: +``` + +Also update the `_get_hasher` return type annotation and any docstring mentions of `BaseSemanticHasher`. + +- [ ] **Step 2: Update `contexts/core.py` docstring** + +Update the `DataContext` docstring — replace `semantic_hasher.type_handler_registry` with `semantic_hasher.type_semantic_hasher_registry`: + +```python +@dataclass +class DataContext: + """Data context containing all versioned components needed for data interpretation. + + Attributes: + context_key: Unique identifier (e.g., "std:v0.1:default") + version: Version string (e.g., "v0.1") + description: Human-readable description + type_converter: Type converter for Python ↔ Arrow conversion and registration. + arrow_hasher: Arrow table hasher for this context. + semantic_hasher: General semantic hasher for this context. The + ``PythonTypeSemanticHasherRegistry`` used for hashing is accessible via + ``semantic_hasher.type_semantic_hasher_registry``. + """ +``` + +- [ ] **Step 3: Commit** + +```bash +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + add src/orcapod/hashing/semantic_hashing/content_identifiable_mixin.py \ + src/orcapod/contexts/core.py +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + commit -m "refactor: update BaseSemanticHasher → SemanticAwarePythonHasher refs in mixin and core" +``` + +--- + +## Task 6: Update `__init__.py` exports and `defaults.py` + +**Files:** +- Modify: `src/orcapod/hashing/semantic_hashing/__init__.py` +- Modify: `src/orcapod/hashing/__init__.py` +- Modify: `src/orcapod/hashing/defaults.py` + +- [ ] **Step 1: Update `semantic_hashing/__init__.py`** + +```python +""" +orcapod.hashing.semantic_hashing +================================= + SemanticAwarePythonHasher -- content-based recursive object hasher + PythonTypeSemanticHasherRegistry -- MRO-aware registry mapping types → PythonTypeSemanticHasherProtocol + BuiltinPythonTypeSemanticHasherRegistry -- pre-populated registry with built-in hashers + ContentIdentifiableMixin -- convenience mixin for content-identifiable objects + +Built-in PythonTypeSemanticHasherProtocol implementations: + PathSemanticHasher -- pathlib.Path → file-content hash + UUIDSemanticHasher -- uuid.UUID → canonical bytes + BytesSemanticHasher -- bytes/bytearray → hex string + FunctionSemanticHasher -- callable → via FunctionInfoExtractorProtocol + TypeObjectSemanticHasher -- type objects → "type:." + register_builtin_python_type_semantic_hashers -- populate a registry with all of the above + +Function info extractors (used by FunctionSemanticHasher): + FunctionNameExtractor + FunctionSignatureExtractor + FunctionInfoExtractorFactory +""" + +from orcapod.hashing.semantic_hashing.builtin_handlers import ( + BytesSemanticHasher, + FunctionSemanticHasher, + PathSemanticHasher, + TypeObjectSemanticHasher, + UUIDSemanticHasher, + register_builtin_python_type_semantic_hashers, +) +from orcapod.hashing.semantic_hashing.content_identifiable_mixin import ( + ContentIdentifiableMixin, +) +from orcapod.hashing.semantic_hashing.function_info_extractors import ( + FunctionInfoExtractorFactory, + FunctionNameExtractor, + FunctionSignatureExtractor, +) +from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher +from orcapod.hashing.semantic_hashing.type_handler_registry import ( + BuiltinPythonTypeSemanticHasherRegistry, + PythonTypeSemanticHasherRegistry, +) + +__all__ = [ + "SemanticAwarePythonHasher", + "PythonTypeSemanticHasherRegistry", + "BuiltinPythonTypeSemanticHasherRegistry", + "ContentIdentifiableMixin", + "PathSemanticHasher", + "UUIDSemanticHasher", + "BytesSemanticHasher", + "FunctionSemanticHasher", + "TypeObjectSemanticHasher", + "register_builtin_python_type_semantic_hashers", + "FunctionNameExtractor", + "FunctionSignatureExtractor", + "FunctionInfoExtractorFactory", +] +``` + +- [ ] **Step 2: Update `hashing/__init__.py`** + +```python +""" +OrcaPod hashing package. + +Public API +---------- + SemanticAwarePythonHasher -- content-based recursive object hasher + SemanticHasherProtocol -- protocol for semantic hashers + PythonTypeSemanticHasherRegistry -- registry mapping types to PythonTypeSemanticHasherProtocol instances + get_default_semantic_hasher -- global default SemanticHasherProtocol factory + get_default_python_type_semantic_hasher_registry -- global default registry factory + ContentIdentifiableMixin -- convenience mixin for content-identifiable objects + +Built-in hashers (importable for custom registry setup): + PathSemanticHasher + UUIDSemanticHasher + BytesSemanticHasher + FunctionSemanticHasher + TypeObjectSemanticHasher + register_builtin_python_type_semantic_hashers + +Utility: + FileContentHasherProtocol + StringCacherProtocol + FunctionInfoExtractorProtocol + ArrowHasherProtocol +""" + +from orcapod.hashing.defaults import ( + get_default_arrow_hasher, + get_default_python_type_semantic_hasher_registry, + get_default_semantic_hasher, +) +from orcapod.hashing.file_hashers import BasicFileHasher, CachedFileHasher +from orcapod.hashing.hash_utils import hash_file +from orcapod.hashing.semantic_hashing.builtin_handlers import ( + BytesSemanticHasher, + FunctionSemanticHasher, + PathSemanticHasher, + TypeObjectSemanticHasher, + UUIDSemanticHasher, + register_builtin_python_type_semantic_hashers, +) +from orcapod.hashing.semantic_hashing.content_identifiable_mixin import ( + ContentIdentifiableMixin, +) +from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher +from orcapod.hashing.semantic_hashing.type_handler_registry import ( + BuiltinPythonTypeSemanticHasherRegistry, + PythonTypeSemanticHasherRegistry, +) +from orcapod.protocols.hashing_protocols import ( + ArrowHasherProtocol, + ContentIdentifiableProtocol, + FileContentHasherProtocol, + FunctionInfoExtractorProtocol, + PythonTypeSemanticHasherProtocol, + SemanticHasherProtocol, + SemanticTypeHasherProtocol, + StringCacherProtocol, +) + +try: + from orcapod.hashing.legacy_core import ( + HashableMixin, + function_content_hash, + get_function_signature, + hash_function, + hash_data, + hash_pathset, + hash_to_hex, + hash_to_int, + hash_to_uuid, + ) +except ImportError: + HashableMixin = None # type: ignore[assignment,misc] + function_content_hash = None # type: ignore[assignment] + get_function_signature = None # type: ignore[assignment] + hash_function = None # type: ignore[assignment] + hash_data = None # type: ignore[assignment] + hash_pathset = None # type: ignore[assignment] + hash_to_hex = None # type: ignore[assignment] + hash_to_int = None # type: ignore[assignment] + hash_to_uuid = None # type: ignore[assignment] + +__all__ = [ + "SemanticAwarePythonHasher", + "PythonTypeSemanticHasherRegistry", + "BuiltinPythonTypeSemanticHasherRegistry", + "get_default_python_type_semantic_hasher_registry", + "get_default_semantic_hasher", + "ContentIdentifiableMixin", + "PathSemanticHasher", + "UUIDSemanticHasher", + "BytesSemanticHasher", + "FunctionSemanticHasher", + "TypeObjectSemanticHasher", + "register_builtin_python_type_semantic_hashers", + "SemanticHasherProtocol", + "ContentIdentifiableProtocol", + "PythonTypeSemanticHasherProtocol", + "FileContentHasherProtocol", + "ArrowHasherProtocol", + "StringCacherProtocol", + "FunctionInfoExtractorProtocol", + "SemanticTypeHasherProtocol", + "BasicFileHasher", + "CachedFileHasher", + "hash_file", + "get_default_arrow_hasher", + "HashableMixin", + "hash_to_hex", + "hash_to_int", + "hash_to_uuid", + "hash_function", + "get_function_signature", + "function_content_hash", + "hash_pathset", + "hash_data", +] +``` + +- [ ] **Step 3: Update `hashing/defaults.py`** + +```python +# Default hasher accessors for the OrcaPod hashing system. + +from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeSemanticHasherRegistry +from orcapod.protocols import hashing_protocols as hp + + +def get_default_python_type_semantic_hasher_registry() -> PythonTypeSemanticHasherRegistry: + """Return the PythonTypeSemanticHasherRegistry from the default data context's semantic hasher. + + Returns: + PythonTypeSemanticHasherRegistry: The registry from the default data context. + """ + from orcapod.contexts import get_default_context + return get_default_context().semantic_hasher.type_semantic_hasher_registry + + +def get_default_semantic_hasher() -> hp.SemanticHasherProtocol: + """Return the SemanticHasherProtocol from the default data context.""" + from orcapod.contexts import get_default_context + return get_default_context().semantic_hasher + + +def get_default_arrow_hasher() -> hp.ArrowHasherProtocol: + """Return the ArrowHasherProtocol from the default data context. + + Note: file-hash caching (formerly via ``set_cacher``) has been removed. + ``StarfixArrowHasher`` does not support per-path caching. Use + ``CachedFileHasher`` when constructing a custom context if caching is needed. + """ + from orcapod.contexts import get_default_context + return get_default_context().arrow_hasher +``` + +- [ ] **Step 4: Commit** + +```bash +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + add src/orcapod/hashing/semantic_hashing/__init__.py \ + src/orcapod/hashing/__init__.py \ + src/orcapod/hashing/defaults.py +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + commit -m "refactor(hashing): update __init__.py exports and defaults for rename" +``` + +--- + +## Task 7: Update `test_semantic_hasher.py` → run tests + +**Files:** +- Modify: `tests/test_hashing/test_semantic_hasher.py` + +- [ ] **Step 1: Update imports at the top of the file** + +```python +# Old: +from orcapod.hashing.semantic_hashing.builtin_handlers import register_builtin_handlers +from orcapod.hashing.semantic_hashing.semantic_hasher import ( + BaseSemanticHasher, + _is_namedtuple, +) +from orcapod.hashing.semantic_hashing.type_handler_registry import ( + TypeHandlerRegistry, + get_default_type_handler_registry, +) + +# New: +from orcapod.hashing.semantic_hashing.builtin_handlers import ( + register_builtin_python_type_semantic_hashers, +) +from orcapod.hashing.semantic_hashing.semantic_hasher import ( + SemanticAwarePythonHasher, + _is_namedtuple, +) +from orcapod.hashing.semantic_hashing.type_handler_registry import ( + PythonTypeSemanticHasherRegistry, + get_default_python_type_semantic_hasher_registry, +) +``` + +- [ ] **Step 2: Update `make_hasher()` fixture and type annotations** + +```python +def make_hasher(strict: bool = True) -> SemanticAwarePythonHasher: + """Create a fresh SemanticAwarePythonHasher with an isolated registry.""" + registry = PythonTypeSemanticHasherRegistry() + register_builtin_python_type_semantic_hashers(registry) + return SemanticAwarePythonHasher( + hasher_id="test_v1", type_semantic_hasher_registry=registry, strict=strict + ) + + +@pytest.fixture +def hasher() -> SemanticAwarePythonHasher: + return make_hasher(strict=True) + + +@pytest.fixture +def lenient_hasher() -> SemanticAwarePythonHasher: + return make_hasher(strict=False) +``` + +- [ ] **Step 3: Update `_DummyHandler` in `TestTypeHandlerRegistry` (near line 827)** + +```python +# Old: +class _DummyHandler: + def __init__(self, tag: str) -> None: + self.tag = tag + + def handle(self, obj: Any, hasher: Any) -> Any: + return f"{self.tag}:{obj}" + +# New: +class _DummySemanticHasher: + def __init__(self, tag: str) -> None: + self.tag = tag + + def hash(self, obj: Any, hasher: Any) -> Any: + # Returns a ContentHash by delegating to the outer hasher + return hasher.hash_object(f"{self.tag}:{obj}") +``` + +- [ ] **Step 4: Update `TestTypeHandlerRegistry` class — rename class, method calls, and dummy handler** + +Rename the test class to `TestPythonTypeSemanticHasherRegistry` and update every reference: +- `TypeHandlerRegistry()` → `PythonTypeSemanticHasherRegistry()` +- `_DummyHandler(...)` → `_DummySemanticHasher(...)` +- `reg.get_handler(...)` → `reg.get_semantic_hasher(...)` +- `reg.has_handler(...)` → `reg.has_semantic_hasher(...)` +- `reg.get_handler_for_type(...)` → `reg.get_semantic_hasher_for_type(...)` + +Example of updated test methods: +```python +class TestPythonTypeSemanticHasherRegistry: + def test_register_and_get_exact(self): + reg = PythonTypeSemanticHasherRegistry() + h = _DummySemanticHasher("base") + reg.register(Base, h) + assert reg.get_semantic_hasher(Base()) is h + + def test_mro_lookup_child(self): + reg = PythonTypeSemanticHasherRegistry() + h = _DummySemanticHasher("base") + reg.register(Base, h) + assert reg.get_semantic_hasher(Child()) is h + + def test_mro_lookup_grandchild(self): + reg = PythonTypeSemanticHasherRegistry() + h = _DummySemanticHasher("base") + reg.register(Base, h) + assert reg.get_semantic_hasher(GrandChild()) is h + + def test_more_specific_handler_wins(self): + reg = PythonTypeSemanticHasherRegistry() + h_base = _DummySemanticHasher("base") + h_child = _DummySemanticHasher("child") + reg.register(Base, h_base) + reg.register(Child, h_child) + assert reg.get_semantic_hasher(Child()) is h_child + assert reg.get_semantic_hasher(GrandChild()) is h_child + + def test_unregistered_returns_none(self): + reg = PythonTypeSemanticHasherRegistry() + assert reg.get_semantic_hasher(Base()) is None + + def test_unregister_removes_handler(self): + reg = PythonTypeSemanticHasherRegistry() + h = _DummySemanticHasher("base") + reg.register(Base, h) + assert reg.unregister(Base) is True + assert reg.get_semantic_hasher(Base()) is None + + def test_unregister_nonexistent_returns_false(self): + reg = PythonTypeSemanticHasherRegistry() + assert reg.unregister(Base) is False + + def test_replace_existing_handler(self): + reg = PythonTypeSemanticHasherRegistry() + h1 = _DummySemanticHasher("first") + h2 = _DummySemanticHasher("second") + reg.register(Base, h1) + reg.register(Base, h2) + assert reg.get_semantic_hasher(Base()) is h2 + + def test_register_non_type_raises(self): + reg = PythonTypeSemanticHasherRegistry() + with pytest.raises(TypeError): + reg.register("not_a_type", _DummySemanticHasher("x")) # type: ignore[arg-type] + + def test_has_semantic_hasher_exact(self): + reg = PythonTypeSemanticHasherRegistry() + reg.register(Base, _DummySemanticHasher("b")) + assert reg.has_semantic_hasher(Base) is True + + def test_has_semantic_hasher_via_mro(self): + reg = PythonTypeSemanticHasherRegistry() + reg.register(Base, _DummySemanticHasher("b")) + assert reg.has_semantic_hasher(Child) is True + + def test_has_semantic_hasher_false(self): + reg = PythonTypeSemanticHasherRegistry() + assert reg.has_semantic_hasher(Base) is False + + def test_registered_types_snapshot(self): + reg = PythonTypeSemanticHasherRegistry() + reg.register(Base, _DummySemanticHasher("b")) + reg.register(Child, _DummySemanticHasher("c")) + types = reg.registered_types() + assert Base in types + assert Child in types + + def test_len(self): + reg = PythonTypeSemanticHasherRegistry() + assert len(reg) == 0 + reg.register(Base, _DummySemanticHasher("b")) + assert len(reg) == 1 + reg.register(Child, _DummySemanticHasher("c")) + assert len(reg) == 2 + + def test_get_semantic_hasher_for_type(self): + reg = PythonTypeSemanticHasherRegistry() + h = _DummySemanticHasher("b") + reg.register(Base, h) + assert reg.get_semantic_hasher_for_type(Base) is h + assert reg.get_semantic_hasher_for_type(Child) is h # via MRO + assert reg.get_semantic_hasher_for_type(int) is None +``` + +Also update any remaining references in the file body to `get_default_type_handler_registry` → `get_default_python_type_semantic_hasher_registry`, and any fixture type annotations. + +- [ ] **Step 5: Run tests** + +```bash +uv run --project /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + pytest tests/test_hashing/test_semantic_hasher.py -x -v +``` + +Expected: all tests pass. + +- [ ] **Step 6: Commit** + +```bash +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + add tests/test_hashing/test_semantic_hasher.py +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + commit -m "test(semantic_hasher): update for registry rename and hash() protocol tightening" +``` + +--- + +## Task 8: Add `visit_extension` to `ArrowTypeDataVisitor` + rewrite `SemanticHashingVisitor` + +**Files:** +- Modify: `src/orcapod/hashing/visitors.py` + +- [ ] **Step 1: Write a failing test for `visit_extension` dispatch** + +Create `tests/test_hashing/test_extension_type_hashing.py`: + +```python +"""Tests for extension type column hashing via SemanticHashingVisitor.""" + +from __future__ import annotations + +import pyarrow as pa +import pytest +from pathlib import Path + +from orcapod.hashing.visitors import SemanticHashingVisitor +from orcapod.contexts import get_default_context + + +@pytest.fixture +def ctx(): + return get_default_context() + + +class TestArrowTypeDataVisitorExtension: + def test_visit_dispatches_to_visit_extension_for_extension_types(self, ctx): + """visit() routes ExtensionType columns to visit_extension(), not visit_struct().""" + arrow_type = ctx.type_converter.register_python_class(Path) + assert isinstance(arrow_type, pa.ExtensionType), ( + "Path must be registered as an Arrow extension type" + ) + + calls = [] + + class TrackingVisitor(SemanticHashingVisitor): + def visit_extension(self, ext_type, storage_value): + calls.append("visit_extension") + return super().visit_extension(ext_type, storage_value) + + def visit_struct(self, struct_type, data): + calls.append("visit_struct") + return super().visit_struct(struct_type, data) + + visitor = TrackingVisitor(ctx.type_converter, ctx.semantic_hasher) + # Any value is fine for this dispatch test — use a dummy string (storage for Path is str) + visitor.visit(arrow_type, "/tmp/dummy") + assert "visit_extension" in calls + assert "visit_struct" not in calls + + +class TestSemanticHashingVisitorExtension: + def test_path_column_hashed_to_large_binary(self, ctx, tmp_path): + """Path extension columns are replaced with pa.large_binary() hash tokens.""" + file = tmp_path / "test.txt" + file.write_text("hello") + + arrow_type = ctx.type_converter.register_python_class(Path) + storage_val = ctx.type_converter.python_to_storage(Path(file), Path) + + visitor = SemanticHashingVisitor(ctx.type_converter, ctx.semantic_hasher) + new_type, new_data = visitor.visit(arrow_type, storage_val) + + assert new_type == pa.large_binary() + assert isinstance(new_data, bytes) + + def test_same_content_same_hash(self, ctx, tmp_path): + """Two paths pointing to files with identical content produce the same hash bytes.""" + file1 = tmp_path / "a.txt" + file2 = tmp_path / "b.txt" + file1.write_text("identical content") + file2.write_text("identical content") + + arrow_type = ctx.type_converter.register_python_class(Path) + storage1 = ctx.type_converter.python_to_storage(Path(file1), Path) + storage2 = ctx.type_converter.python_to_storage(Path(file2), Path) + + visitor = SemanticHashingVisitor(ctx.type_converter, ctx.semantic_hasher) + _, hash1 = visitor.visit(arrow_type, storage1) + _, hash2 = visitor.visit(arrow_type, storage2) + + assert hash1 == hash2 + + def test_different_content_different_hash(self, ctx, tmp_path): + """Files with different content produce different hash bytes.""" + file1 = tmp_path / "x.txt" + file2 = tmp_path / "y.txt" + file1.write_text("content A") + file2.write_text("content B") + + arrow_type = ctx.type_converter.register_python_class(Path) + storage1 = ctx.type_converter.python_to_storage(Path(file1), Path) + storage2 = ctx.type_converter.python_to_storage(Path(file2), Path) + + visitor = SemanticHashingVisitor(ctx.type_converter, ctx.semantic_hasher) + _, hash1 = visitor.visit(arrow_type, storage1) + _, hash2 = visitor.visit(arrow_type, storage2) + + assert hash1 != hash2 + + def test_binary_encoding_format(self, ctx, tmp_path): + """Hash bytes have format b':::'.""" + file = tmp_path / "test.txt" + file.write_text("test") + + arrow_type = ctx.type_converter.register_python_class(Path) + storage_val = ctx.type_converter.python_to_storage(Path(file), Path) + + visitor = SemanticHashingVisitor(ctx.type_converter, ctx.semantic_hasher) + _, hash_bytes = visitor.visit(arrow_type, storage_val) + + assert b"::" in hash_bytes + type_prefix, hash_part = hash_bytes.split(b"::", 1) + # Extension name "orcapod.path" → dots replaced with colons + assert type_prefix == b"orcapod:path" + # hash_part should be "method:digest" — at least one colon + assert b":" in hash_part + + def test_null_value_passthrough(self, ctx): + """Null storage values pass through as-is.""" + arrow_type = ctx.type_converter.register_python_class(Path) + + visitor = SemanticHashingVisitor(ctx.type_converter, ctx.semantic_hasher) + new_type, new_data = visitor.visit(arrow_type, None) + + assert new_type == arrow_type + assert new_data is None +``` + +- [ ] **Step 2: Run tests — verify they fail** + +```bash +uv run --project /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + pytest tests/test_hashing/test_extension_type_hashing.py -x -v +``` + +Expected: ImportError or AttributeError (methods don't exist yet). + +- [ ] **Step 3: Rewrite `visitors.py`** + +```python +""" +Generic visitor pattern for traversing Arrow types and data simultaneously. +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +from orcapod.utils.lazy_module import LazyModule + +if TYPE_CHECKING: + import pyarrow as pa + from orcapod.semantic_types.universal_converter import UniversalTypeConverter + from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher +else: + pa = LazyModule("pyarrow") + + +class ArrowTypeDataVisitor(ABC): + """Base visitor for traversing Arrow types and data simultaneously.""" + + @abstractmethod + def visit_struct( + self, struct_type: "pa.StructType", data: dict | None + ) -> tuple["pa.DataType", Any]: + """Visit a struct type with its data.""" + pass + + @abstractmethod + def visit_list( + self, list_type: "pa.ListType", data: list | None + ) -> tuple["pa.DataType", Any]: + """Visit a list type with its data.""" + pass + + @abstractmethod + def visit_map( + self, map_type: "pa.MapType", data: dict | None + ) -> tuple["pa.DataType", Any]: + """Visit a map type with its data.""" + pass + + @abstractmethod + def visit_primitive( + self, primitive_type: "pa.DataType", data: Any + ) -> tuple["pa.DataType", Any]: + """Visit a primitive type with its data.""" + pass + + def visit_extension( + self, + extension_type: "pa.ExtensionType", + storage_value: Any, + ) -> tuple["pa.DataType", Any]: + """Handle an Arrow extension type. + + Default implementation: passthrough — preserves the extension type and its + storage value unchanged so that the downstream ``StarfixArrowHasher`` / + ``ArrowDigester`` sees the full extension metadata when it receives the + pre-processed table. + + Subclasses may override to convert recognised extension types to a hashed + ``pa.large_binary()`` value. + + Args: + extension_type: The Arrow extension type. + storage_value: The storage-level value (result of ``to_pylist()`` on the column). + + Returns: + Tuple of ``(new_arrow_type, new_data)``. + """ + return extension_type, storage_value + + def visit(self, arrow_type: "pa.DataType", data: Any) -> tuple["pa.DataType", Any]: + """Main dispatch method that routes to the appropriate visit method. + + Extension types are checked **first** — before the struct check — because + extension types with struct storage would otherwise be incorrectly routed + into ``visit_struct``. After ``visit_extension``, the result is re-visited + only if the type changed AND is no longer an extension type (enables + composability, avoids infinite recursion). + + Args: + arrow_type: Arrow data type to process. + data: Corresponding data value. + + Returns: + Tuple of ``(new_arrow_type, new_data)``. + """ + if isinstance(arrow_type, pa.ExtensionType): + new_type, new_data = self.visit_extension(arrow_type, data) + if new_type is not arrow_type and not isinstance(new_type, pa.ExtensionType): + return self.visit(new_type, new_data) + return new_type, new_data + + if pa.types.is_struct(arrow_type): + return self.visit_struct(arrow_type, data) + elif pa.types.is_list(arrow_type) or pa.types.is_large_list(arrow_type): + return self.visit_list(arrow_type, data) + elif pa.types.is_fixed_size_list(arrow_type): + return self.visit_list(arrow_type, data) + elif pa.types.is_map(arrow_type): + return self.visit_map(arrow_type, data) + else: + return self.visit_primitive(arrow_type, data) + + def _visit_struct_fields( + self, struct_type: "pa.StructType", data: dict | None + ) -> tuple["pa.StructType", dict]: + """Recursively process struct fields. Default behavior for regular structs.""" + if data is None: + return struct_type, None + + new_fields = [] + new_data = {} + + for field in struct_type: + field_data = data.get(field.name) + new_field_type, new_field_data = self.visit(field.type, field_data) + new_fields.append(pa.field(field.name, new_field_type)) + new_data[field.name] = new_field_data + + return pa.struct(new_fields), new_data + + def _visit_list_elements( + self, list_type: "pa.ListType", data: list | None + ) -> tuple["pa.DataType", list]: + """Recursively process list elements.""" + if data is None: + return list_type, None + + element_type = list_type.value_type + processed_elements = [] + new_element_type = None + + for item in data: + current_element_type, processed_item = self.visit(element_type, item) + processed_elements.append(processed_item) + if new_element_type is None: + new_element_type = current_element_type + + if new_element_type is None: + new_element_type = element_type + + if pa.types.is_large_list(list_type): + return pa.large_list(new_element_type), processed_elements + elif pa.types.is_fixed_size_list(list_type): + return pa.list_(new_element_type, list_type.list_size), processed_elements + else: + return pa.list_(new_element_type), processed_elements + + +class SemanticHashingError(Exception): + """Exception raised when semantic hashing fails.""" + pass + + +class SemanticHashingVisitor(ArrowTypeDataVisitor): + """Visitor that replaces extension-typed columns with their content hashes. + + For each Arrow column whose type is a ``pa.ExtensionType``: + + 1. Look up the corresponding Python type via ``type_converter``. + 2. If the Python type has a semantic hasher registered in ``python_hasher``, + convert the storage value to a Python object and hash it, replacing the + column with a ``pa.large_binary()`` value of the form:: + + + b"::" + content_hash.to_prefixed_digest() + + where ``type_name`` is the extension name with dots replaced by colons + (e.g. ``"orcapod.path"`` → ``"orcapod:path"``), and + ``to_prefixed_digest()`` = ``method_bytes + b":" + digest``. + 3. If no hasher is registered (or the converter doesn't know the type), + return the extension type and storage value unchanged. The downstream + ``StarfixArrowHasher`` / ``ArrowDigester`` will see the full extension + metadata intact and hash it in a type-aware way. + + Args: + type_converter: The active ``UniversalTypeConverter`` for resolving + extension type → Python type and storage → Python conversion. + python_hasher: The active ``SemanticAwarePythonHasher`` for hashing + Python objects. + """ + + def __init__( + self, + type_converter: "UniversalTypeConverter", + python_hasher: "SemanticAwarePythonHasher", + ) -> None: + self._type_converter = type_converter + self._python_hasher = python_hasher + self._current_field_path: list[str] = [] + + def visit_extension( + self, + extension_type: "pa.ExtensionType", + storage_value: Any, + ) -> tuple["pa.DataType", Any]: + """Hash an extension type value to pa.large_binary(), or passthrough.""" + if storage_value is None: + return extension_type, None + + from typing import Any as _Any + + # Resolve extension type → Python type. + python_type = self._type_converter.arrow_type_to_python_type(extension_type) + + # If the converter couldn't resolve to a concrete class, passthrough. + if python_type is _Any or not isinstance(python_type, type): + return extension_type, storage_value + + # Only hash if a semantic hasher is registered for this Python type. + if not self._python_hasher.type_semantic_hasher_registry.has_semantic_hasher( + python_type + ): + return extension_type, storage_value + + # Convert storage value → Python object and hash it. + python_obj = self._type_converter.storage_to_python(storage_value, python_type) + content_hash = self._python_hasher.hash_object(python_obj) + + # Encode as binary: ":::" + # Dots in the extension name → colons (e.g. "orcapod.path" → "orcapod:path"). + # The "::" separator is unambiguous because to_prefixed_digest() uses only ":". + type_name = extension_type.extension_name.replace(".", ":") + hash_bytes = ( + type_name.encode("ascii") + + b"::" + + content_hash.to_prefixed_digest() + ) + return pa.large_binary(), hash_bytes + + def visit_struct( + self, struct_type: "pa.StructType", data: dict | None + ) -> tuple["pa.DataType", Any]: + """Regular struct (no extension identity) — recurse into fields.""" + if data is None: + return struct_type, None + return self._visit_struct_fields(struct_type, data) + + def visit_list( + self, list_type: "pa.ListType", data: list | None + ) -> tuple["pa.DataType", Any]: + """Recurse into list elements.""" + if data is None: + return list_type, None + self._current_field_path.append("[*]") + try: + return self._visit_list_elements(list_type, data) + finally: + self._current_field_path.pop() + + def visit_map( + self, map_type: "pa.MapType", data: dict | None + ) -> tuple["pa.DataType", Any]: + """Pass map types through unchanged.""" + return map_type, data + + def visit_primitive( + self, primitive_type: "pa.DataType", data: Any + ) -> tuple["pa.DataType", Any]: + """Pass primitive types through unchanged.""" + return primitive_type, data + + def _visit_struct_fields( + self, struct_type: "pa.StructType", data: dict | None + ) -> tuple["pa.StructType", dict]: + """Override to add field path tracking for better error messages.""" + if data is None: + return struct_type, None + + new_fields = [] + new_data = {} + + for field in struct_type: + self._current_field_path.append(field.name) + try: + field_data = data.get(field.name) + new_field_type, new_field_data = self.visit(field.type, field_data) + new_fields.append(pa.field(field.name, new_field_type)) + new_data[field.name] = new_field_data + finally: + self._current_field_path.pop() + + return pa.struct(new_fields), new_data +``` + +- [ ] **Step 4: Run tests — verify they pass** + +```bash +uv run --project /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + pytest tests/test_hashing/test_extension_type_hashing.py -x -v +``` + +Expected: all tests pass. + +- [ ] **Step 5: Commit** + +```bash +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + add src/orcapod/hashing/visitors.py \ + tests/test_hashing/test_extension_type_hashing.py +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + commit -m "feat(visitors): add visit_extension dispatch; rewrite SemanticHashingVisitor for extension types" +``` + +--- + +## Task 9: Update `StarfixArrowHasher`, delete `SemanticArrowHasher` + +**Files:** +- Modify: `src/orcapod/hashing/arrow_hashers.py` + +- [ ] **Step 1: Rewrite `arrow_hashers.py`** + +Delete the entire `SemanticArrowHasher` class. Update `StarfixArrowHasher`: + +```python +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import pyarrow as pa +from starfix import ArrowDigester + +from orcapod.hashing.schema_cleaner import clean_schema_for_hashing, has_extension_metadata +from orcapod.hashing.visitors import SemanticHashingVisitor +from orcapod.types import ContentHash +from orcapod.utils import arrow_utils + +if TYPE_CHECKING: + from orcapod.semantic_types.universal_converter import UniversalTypeConverter + from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher + + +class StarfixArrowHasher: + """Arrow table hasher backed by the starfix-python ``ArrowDigester``. + + Pipeline + -------- + 1. **Semantic pre-processing** — the ``SemanticHashingVisitor`` traverses + every column. Extension-typed columns whose Python type has a registered + semantic hasher are replaced with ``pa.large_binary()`` hash tokens + (e.g. ``Path`` columns are replaced by their file-content hash). + Extension-typed columns without a registered hasher pass through with + their full extension metadata intact. + 2. **Starfix hashing** — ``ArrowDigester.hash_table`` produces a 35-byte + versioned SHA-256 digest that is byte-for-byte identical to the Rust + ``starfix`` crate output. + + Parameters + ---------- + type_converter: + ``UniversalTypeConverter`` used to resolve extension types to Python + types and convert storage values back to Python objects. + semantic_hasher: + ``SemanticAwarePythonHasher`` used to hash Python objects extracted + from extension-typed columns. + hasher_id: + String identifier embedded in every ``ContentHash`` produced by this + hasher. + """ + + def __init__( + self, + type_converter: "UniversalTypeConverter", + semantic_hasher: "SemanticAwarePythonHasher", + hasher_id: str, + ) -> None: + self._type_converter = type_converter + self._semantic_hasher = semantic_hasher + self._hasher_id = hasher_id + + @property + def hasher_id(self) -> str: + return self._hasher_id + + def _process_table_columns(self, table: "pa.Table | pa.RecordBatch") -> "pa.Table": + """Replace semantic-typed columns with their content-hash bytes.""" + new_columns: list[pa.Array] = [] + new_fields: list[pa.Field] = [] + + for i, field in enumerate(table.schema): + # Short-circuit: columns that cannot contain semantic types skip + # the costly Python round-trip. Extension types must pass through + # so visit_extension can process them. + if not ( + isinstance(field.type, pa.ExtensionType) + or pa.types.is_struct(field.type) + or pa.types.is_list(field.type) + or pa.types.is_large_list(field.type) + or pa.types.is_fixed_size_list(field.type) + or pa.types.is_map(field.type) + ): + new_columns.append(table.column(i)) + new_fields.append(field) + continue + + column_data = table.column(i).to_pylist() + visitor = SemanticHashingVisitor(self._type_converter, self._semantic_hasher) + + try: + new_type: pa.DataType | None = None + processed_data: list[Any] = [] + for value in column_data: + processed_type, processed_value = visitor.visit(field.type, value) + if new_type is None and processed_value is not None: + new_type = processed_type + processed_data.append(processed_value) + + if new_type is None: + new_type = field.type + new_columns.append(pa.array(processed_data, type=new_type)) + new_fields.append(field.with_type(new_type)) + + except Exception as exc: + raise RuntimeError( + f"Failed to process column '{field.name}': {exc}" + ) from exc + + return pa.table( + new_columns, + schema=pa.schema(new_fields, metadata=table.schema.metadata), + ) + + def hash_schema(self, schema: "pa.Schema") -> ContentHash: + """Hash an Arrow schema using the starfix canonical algorithm.""" + include_meta = has_extension_metadata(schema) + if include_meta: + schema = clean_schema_for_hashing(schema) + digest = ArrowDigester.hash_schema(schema, include_metadata=include_meta) + return ContentHash(method=self._hasher_id, digest=digest) + + def hash_table(self, table: "pa.Table | pa.RecordBatch") -> ContentHash: + """Hash an Arrow table (or ``RecordBatch``) using starfix.""" + if isinstance(table, pa.RecordBatch): + table = pa.Table.from_batches([table]) + + processed_table = self._process_table_columns(table) + include_meta = has_extension_metadata(processed_table.schema) + if include_meta: + clean_schema = clean_schema_for_hashing(processed_table.schema) + clean_table = pa.Table.from_arrays( + processed_table.columns, schema=clean_schema + ) + else: + clean_table = processed_table + digest = ArrowDigester.hash_table(clean_table, include_metadata=include_meta) + return ContentHash(method=self._hasher_id, digest=digest) +``` + +- [ ] **Step 2: Commit** + +```bash +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + add src/orcapod/hashing/arrow_hashers.py +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + commit -m "refactor(arrow_hashers): update StarfixArrowHasher for extension types, delete SemanticArrowHasher" +``` + +--- + +## Task 10: Update `test_starfix_arrow_hasher.py`, run tests + +**Files:** +- Modify: `tests/test_hashing/test_starfix_arrow_hasher.py` + +- [ ] **Step 1: Update `_make_hasher()` and remove `SemanticTypeRegistry` import** + +```python +# Remove this import: +# from orcapod.semantic_types import SemanticTypeRegistry + +# Update _make_hasher(): +def _make_hasher() -> StarfixArrowHasher: + from orcapod.contexts import get_default_context + ctx = get_default_context() + return StarfixArrowHasher( + type_converter=ctx.type_converter, + semantic_hasher=ctx.semantic_hasher, + hasher_id=HASHER_ID, + ) +``` + +- [ ] **Step 2: Run the hashing test suite** + +```bash +uv run --project /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + pytest tests/test_hashing/ -x -v +``` + +Expected: all tests pass (golden digests unchanged for plain-schema tables; extension type tests pass). + +- [ ] **Step 3: Commit** + +```bash +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + add tests/test_hashing/test_starfix_arrow_hasher.py +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + commit -m "test(starfix_arrow_hasher): update _make_hasher() for new constructor, remove SemanticTypeRegistry import" +``` + +--- + +## Task 11: Update `v0.1.json`, `context_schema.json`, and `versioned_hashers.py` + +**Files:** +- Modify: `src/orcapod/contexts/data/v0.1.json` +- Modify: `src/orcapod/contexts/data/schemas/context_schema.json` +- Modify: `src/orcapod/hashing/versioned_hashers.py` + +- [ ] **Step 1: Rewrite `v0.1.json`** + +Key design note: `arrow_hasher` now depends on `semantic_hasher`, and `semantic_hasher` depends on `python_type_semantic_hasher_registry`. To avoid a circular dependency, the `pa.Table`/`pa.RecordBatch` handler entries are **removed** from the registry's handlers list (those entries previously referenced `arrow_hasher`). The JSON construction order is: `file_hasher` → `type_converter` → `function_info_extractor` → `python_type_semantic_hasher_registry` → `semantic_hasher` → `arrow_hasher`. + +```json +{ + "context_key": "std:v0.1:default", + "version": "v0.1", + "description": "Initial stable release with extension type hashing support", + "file_hasher": { + "_class": "orcapod.hashing.file_hashers.BasicFileHasher", + "_config": { + "algorithm": "sha256" + } + }, + "type_converter": { + "_class": "orcapod.semantic_types.universal_converter.UniversalTypeConverter", + "_config": { + "logical_type_registry": { + "_class": "orcapod.extension_types.registry.LogicalTypeRegistry", + "_config": { + "logical_types": [ + { + "_class": "orcapod.extension_types.builtin_logical_types.LogicalPath", + "_config": {} + }, + { + "_class": "orcapod.extension_types.builtin_logical_types.LogicalUPath", + "_config": {} + }, + { + "_class": "orcapod.extension_types.builtin_logical_types.LogicalUUID", + "_config": {} + } + ], + "factories": [ + { + "factory": { + "_class": "orcapod.extension_types.dataclass_logical_type_factory.DataclassLogicalTypeFactory", + "_config": {} + }, + "category": "orcapod.dataclass", + "python_bases": [{"_type": "builtins.object"}] + }, + { + "factory": { + "_class": "orcapod.extension_types.pydantic_logical_type_factory.PydanticLogicalTypeFactory", + "_config": {} + }, + "category": "orcapod.pydantic", + "python_bases": [{"_type": "pydantic.BaseModel"}] + } + ] + } + } + } + }, + "function_info_extractor": { + "_class": "orcapod.hashing.semantic_hashing.function_info_extractors.FunctionSignatureExtractor", + "_config": { + "include_module": true, + "include_defaults": true + } + }, + "python_type_semantic_hasher_registry": { + "_class": "orcapod.hashing.semantic_hashing.type_handler_registry.PythonTypeSemanticHasherRegistry", + "_config": { + "handlers": [ + [{"_type": "builtins.bytes"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesSemanticHasher", "_config": {}}], + [{"_type": "builtins.bytearray"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesSemanticHasher", "_config": {}}], + [{"_type": "pathlib.Path"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.PathSemanticHasher", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], + [{"_type": "upath.core.UPath"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UPathSemanticHasher", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], + [{"_type": "uuid.UUID"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UUIDSemanticHasher", "_config": {}}], + [{"_type": "types.FunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionSemanticHasher", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], + [{"_type": "types.BuiltinFunctionType"},{"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionSemanticHasher", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], + [{"_type": "types.MethodType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionSemanticHasher", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], + [{"_type": "builtins.type"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.TypeObjectSemanticHasher", "_config": {}}], + [{"_type": "types.GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasSemanticHasher", "_config": {}}], + [{"_type": "types.UnionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UnionTypeSemanticHasher", "_config": {}}], + [{"_type": "typing._GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasSemanticHasher", "_config": {}}], + [{"_type": "typing._SpecialForm"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.SpecialFormSemanticHasher", "_config": {}}] + ] + } + }, + "semantic_hasher": { + "_class": "orcapod.hashing.semantic_hashing.semantic_hasher.SemanticAwarePythonHasher", + "_config": { + "hasher_id": "semantic_v0.1", + "type_semantic_hasher_registry": { + "_ref": "python_type_semantic_hasher_registry" + } + } + }, + "arrow_hasher": { + "_class": "orcapod.hashing.arrow_hashers.StarfixArrowHasher", + "_config": { + "hasher_id": "arrow_v0.1", + "type_converter": {"_ref": "type_converter"}, + "semantic_hasher": {"_ref": "semantic_hasher"} + } + }, + "metadata": { + "created_date": "2026-06-24", + "author": "OrcaPod Core Team", + "changelog": [ + "Initial release with Path semantic type support", + "Basic SHA-256 hashing for files and objects", + "Arrow logical serialization method", + "Introduced arrow_v0.1 StarfixArrowHasher using starfix ArrowDigester for cross-language-compatible Arrow hashing", + "Hard cut: replaced shape-based SemanticTypeRegistry with extension-type hashing; renamed all hashing classes to clearer names" + ] + } +} +``` + +- [ ] **Step 2: Update `context_schema.json`** + +Two changes: +1. Remove the `semantic_registry` property from `properties`. +2. Rename `type_handler_registry` → `python_type_semantic_hasher_registry` in `properties`. + +```json +"python_type_semantic_hasher_registry": { + "$ref": "#/$defs/objectspec", + "description": "ObjectSpec for the PythonTypeSemanticHasherRegistry used by the semantic hasher" +}, +``` + +Also update the `examples` section references and remove the `"semantic_registry"` entry. + +- [ ] **Step 3: Update `versioned_hashers.py`** + +```python +""" +Versioned hasher factories for OrcaPod. +""" + +from __future__ import annotations + +import logging +from typing import Any + +from orcapod.protocols import hashing_protocols as hp + +logger = logging.getLogger(__name__) + +_CURRENT_SEMANTIC_HASHER_ID = "semantic_v0.1" +_CURRENT_ARROW_HASHER_ID = "arrow_v0.1" + + +def get_versioned_semantic_hasher( + hasher_id: str = _CURRENT_SEMANTIC_HASHER_ID, + strict: bool = True, + type_semantic_hasher_registry: "Any | None" = None, +) -> hp.SemanticHasherProtocol: + """Return a SemanticAwarePythonHasher configured for the current version. + + Parameters + ---------- + hasher_id: + Identifier embedded in every ContentHash produced by this hasher. + strict: + When True raises TypeError for unhandled types. When False falls back + to a best-effort string representation. + type_semantic_hasher_registry: + Optional ``PythonTypeSemanticHasherRegistry`` to inject. When None the + global default registry is used. + """ + from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher + + if type_semantic_hasher_registry is None: + from orcapod.hashing.semantic_hashing.type_handler_registry import ( + get_default_python_type_semantic_hasher_registry, + ) + type_semantic_hasher_registry = get_default_python_type_semantic_hasher_registry() + + logger.debug( + "get_versioned_semantic_hasher: creating SemanticAwarePythonHasher " + "(hasher_id=%r, strict=%r)", + hasher_id, + strict, + ) + return SemanticAwarePythonHasher( + hasher_id=hasher_id, + type_semantic_hasher_registry=type_semantic_hasher_registry, + strict=strict, + ) + + +def get_versioned_semantic_arrow_hasher( + hasher_id: str = _CURRENT_ARROW_HASHER_ID, +) -> hp.ArrowHasherProtocol: + """Return a StarfixArrowHasher configured for the current version. + + Sources ``type_converter`` and ``semantic_hasher`` from the default + ``DataContext`` so that the arrow hasher is consistent with all other + versioned components. + + Parameters + ---------- + hasher_id: + Identifier embedded in every ContentHash produced by this hasher. + """ + from orcapod.hashing.arrow_hashers import StarfixArrowHasher + from orcapod.contexts import resolve_context + + ctx = resolve_context(None) # default context + logger.debug( + "get_versioned_semantic_arrow_hasher: creating StarfixArrowHasher " + "(hasher_id=%r)", + hasher_id, + ) + return StarfixArrowHasher( + hasher_id=hasher_id, + type_converter=ctx.type_converter, + semantic_hasher=ctx.semantic_hasher, + ) +``` + +- [ ] **Step 4: Run the full test suite (except test_semantic_types)** + +```bash +uv run --project /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + pytest tests/test_hashing/ tests/test_extension_types/ tests/test_core/ -x -v +``` + +Expected: all tests pass. + +- [ ] **Step 5: Commit** + +```bash +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + add src/orcapod/contexts/data/v0.1.json \ + src/orcapod/contexts/data/schemas/context_schema.json \ + src/orcapod/hashing/versioned_hashers.py +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + commit -m "feat(v0.1): wire extension type hashing into default context; remove semantic_registry" +``` + +--- + +## Task 12: Delete old semantic type system + grep sweep + final test run + +**Files:** +- Delete: `src/orcapod/semantic_types/semantic_struct_converters.py` +- Delete: `src/orcapod/semantic_types/semantic_registry.py` +- Delete: `tests/test_semantic_types/` (all 9 files) +- Delete: `tests/test_hashing/test_file_hashing_consistency.py` +- Modify: `src/orcapod/semantic_types/__init__.py` +- Modify: `src/orcapod/protocols/semantic_types_protocols.py` + +- [ ] **Step 1: Delete old source files** + +```bash +rm /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python/src/orcapod/semantic_types/semantic_struct_converters.py +rm /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python/src/orcapod/semantic_types/semantic_registry.py +``` + +- [ ] **Step 2: Update `semantic_types/__init__.py`** — remove `SemanticTypeRegistry` export + +```python +from .universal_converter import UniversalTypeConverter +from .type_inference import infer_python_schema_from_pylist_data + +__all__ = [ + "UniversalTypeConverter", + "infer_python_schema_from_pylist_data", +] +``` + +- [ ] **Step 3: Remove `SemanticStructConverterProtocol` from `semantic_types_protocols.py`** + +Delete the `SemanticStructConverterProtocol` class and any imports that only support it. Keep `TypeConverterProtocol` and all other classes. + +- [ ] **Step 4: Delete old test files** + +```bash +rm /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python/tests/test_hashing/test_file_hashing_consistency.py +rm -r /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python/tests/test_semantic_types/ +``` + +- [ ] **Step 5: Grep sweep for stale references** + +```bash +grep -rn \ + "SemanticTypeRegistry\|semantic_registry\|SemanticStructConverter\ +\|BaseSemanticHasher\|TypeHandlerRegistry\|BuiltinTypeHandlerRegistry\ +\|TypeHandlerProtocol\|PathContentHandler\|UPathContentHandler\ +\|UUIDHandler\|BytesHandler\|FunctionHandler\|TypeObjectHandler\ +\|SpecialFormHandler\|GenericAliasHandler\|UnionTypeHandler\|ArrowTableHandler\ +\|SchemaHandler\|register_builtin_handlers\|get_default_type_handler_registry\ +\|type_handler_registry\|get_handler\b\|has_handler\b\|SemanticArrowHasher" \ + /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python/src/ \ + /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python/tests/ \ + 2>/dev/null +``` + +Expected: zero matches (fix any that appear before continuing). + +- [ ] **Step 6: Run full test suite** + +```bash +uv run --project /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + pytest tests/test_hashing/ tests/test_extension_types/ tests/test_core/ -x -v +``` + +Expected: all tests pass. + +- [ ] **Step 7: Final commit** + +```bash +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + add -u +git -C /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python \ + commit -m "feat(PLT-1660): hard cut — delete SemanticTypeRegistry and old struct-based hashing system" +``` + +--- + +## Self-Review + +**Spec coverage:** +- ✅ §1 `visit_extension` added to `ArrowTypeDataVisitor`, `visit()` updated (Task 8) +- ✅ §2 `SemanticHashingVisitor` rewritten with binary encoding (Task 8) +- ✅ §3 `StarfixArrowHasher` constructor updated + short-circuit + `SemanticArrowHasher` deleted (Task 9) +- ✅ §4 `SemanticArrowHasher` deleted (Task 9) +- ✅ §5 All class/method renames applied (Tasks 1–6) +- ✅ §6 Protocol tightened: `hash() -> ContentHash` (Tasks 1, 3, 4) +- ✅ §7 `v0.1.json` updated (Task 11) — note: `pa.Table`/`pa.RecordBatch` handlers removed to break circular dep +- ✅ §8 `context_schema.json` updated (Task 11) +- ✅ §9 `DataContext.core` docstring updated (Task 5) +- ✅ §10 `versioned_hashers.py` sources from context (Task 11) +- ✅ Files to delete: all covered (Task 12) +- ✅ Files to update: covered across Tasks 1–11 + +**Circular dependency note (§7 deviation):** The spec says to add `"semantic_hasher": {"_ref": "semantic_hasher"}` to `arrow_hasher._config`. This is correct and implemented. However, to avoid a construction-order cycle (`arrow_hasher` → `semantic_hasher` → `registry` → `arrow_hasher` via `ArrowTableSemanticHasher`), the `pa.Table` and `pa.RecordBatch` handler entries are removed from the `python_type_semantic_hasher_registry` handlers list in `v0.1.json`. These handlers depended on `arrow_hasher` creating the cycle. The `register_builtin_python_type_semantic_hashers()` function still supports them when `arrow_hasher` is passed explicitly (e.g., for custom registry construction in tests). + +**Type consistency check:** +- `SemanticAwarePythonHasher.__init__` takes `type_semantic_hasher_registry` → `v0.1.json` uses key `type_semantic_hasher_registry` ✅ +- `SemanticHashingVisitor.__init__` takes `type_converter, python_hasher` → `_process_table_columns` passes `self._type_converter, self._semantic_hasher` ✅ +- `StarfixArrowHasher.__init__` takes `type_converter, semantic_hasher, hasher_id` → `versioned_hashers.py` passes these by keyword ✅ +- `PythonTypeSemanticHasherRegistry.get_semantic_hasher(obj)` → `SemanticAwarePythonHasher.hash_object()` calls this ✅ +- `PythonTypeSemanticHasherRegistry.has_semantic_hasher(target_type)` → `SemanticHashingVisitor.visit_extension()` calls this ✅ From 14478b353b7a26d8c7922768189cca1c841b75a5 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 20:47:31 +0000 Subject: [PATCH 22/33] fix(test-objective): update test_hashing.py for renamed hashing classes --- test-objective/unit/test_hashing.py | 196 ++++++++++++++-------------- 1 file changed, 98 insertions(+), 98 deletions(-) diff --git a/test-objective/unit/test_hashing.py b/test-objective/unit/test_hashing.py index c2083c21..5dd04c8c 100644 --- a/test-objective/unit/test_hashing.py +++ b/test-objective/unit/test_hashing.py @@ -1,4 +1,4 @@ -"""Tests for BaseSemanticHasher and TypeHandlerRegistry. +"""Tests for SemanticAwarePythonHasher and PythonTypeSemanticHasherRegistry. Specification-derived tests covering deterministic hashing of primitives, structures, ContentHash pass-through, identity_structure resolution, @@ -13,10 +13,10 @@ import pytest -from orcapod.hashing.semantic_hashing.semantic_hasher import BaseSemanticHasher +from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher from orcapod.hashing.semantic_hashing.type_handler_registry import ( - BuiltinTypeHandlerRegistry, - TypeHandlerRegistry, + BuiltinPythonTypeSemanticHasherRegistry, + PythonTypeSemanticHasherRegistry, ) from orcapod.types import ContentHash @@ -27,27 +27,27 @@ @pytest.fixture -def registry() -> TypeHandlerRegistry: - """An empty TypeHandlerRegistry.""" - return TypeHandlerRegistry() +def registry() -> PythonTypeSemanticHasherRegistry: + """An empty PythonTypeSemanticHasherRegistry.""" + return PythonTypeSemanticHasherRegistry() @pytest.fixture -def hasher(registry: TypeHandlerRegistry) -> BaseSemanticHasher: - """A strict BaseSemanticHasher backed by an empty registry.""" - return BaseSemanticHasher( +def hasher(registry: PythonTypeSemanticHasherRegistry) -> SemanticAwarePythonHasher: + """A strict SemanticAwarePythonHasher backed by an empty registry.""" + return SemanticAwarePythonHasher( hasher_id="test_v1", - type_handler_registry=registry, + type_semantic_hasher_registry=registry, strict=True, ) @pytest.fixture -def lenient_hasher(registry: TypeHandlerRegistry) -> BaseSemanticHasher: - """A non-strict BaseSemanticHasher backed by an empty registry.""" - return BaseSemanticHasher( +def lenient_hasher(registry: PythonTypeSemanticHasherRegistry) -> SemanticAwarePythonHasher: + """A non-strict SemanticAwarePythonHasher backed by an empty registry.""" + return SemanticAwarePythonHasher( hasher_id="test_v1", - type_handler_registry=registry, + type_semantic_hasher_registry=registry, strict=False, ) @@ -58,13 +58,13 @@ def lenient_hasher(registry: TypeHandlerRegistry) -> BaseSemanticHasher: class _FakeHandler: - """Minimal object satisfying TypeHandlerProtocol for testing.""" + """Minimal object satisfying PythonTypeSemanticHasherProtocol for testing.""" def __init__(self, return_value: Any = "handled") -> None: self._return_value = return_value - def handle(self, obj: Any, hasher: BaseSemanticHasher) -> Any: - return self._return_value + def hash(self, obj: Any, hasher: SemanticAwarePythonHasher) -> ContentHash: + return ContentHash(method="fake", digest=str(self._return_value).encode()) class _IdentityObj: @@ -79,18 +79,18 @@ def identity_structure(self) -> Any: def content_hash(self, hasher: Any = None) -> ContentHash: if hasher is not None: return hasher.hash_object(self.identity_structure()) - h = BaseSemanticHasher( - "test_v1", type_handler_registry=TypeHandlerRegistry(), strict=False + h = SemanticAwarePythonHasher( + "test_v1", type_semantic_hasher_registry=PythonTypeSemanticHasherRegistry(), strict=False ) return h.hash_object(self.identity_structure()) # =================================================================== -# BaseSemanticHasher -- primitive hashing +# SemanticAwarePythonHasher -- primitive hashing # =================================================================== -class TestBaseSemanticHasherPrimitives: +class TestSemanticAwarePythonHasherPrimitives: """Primitives (int, str, float, bool, None) are hashed deterministically.""" @pytest.mark.parametrize( @@ -99,21 +99,21 @@ class TestBaseSemanticHasherPrimitives: ids=lambda v: f"{type(v).__name__}({v!r})", ) def test_primitive_produces_content_hash( - self, hasher: BaseSemanticHasher, value: Any + self, hasher: SemanticAwarePythonHasher, value: Any ) -> None: result = hasher.hash_object(value) assert isinstance(result, ContentHash) @pytest.mark.parametrize("value", [42, "hello", 3.14, True, None]) def test_primitive_deterministic( - self, hasher: BaseSemanticHasher, value: Any + self, hasher: SemanticAwarePythonHasher, value: Any ) -> None: """Same input always produces the same hash.""" h1 = hasher.hash_object(value) h2 = hasher.hash_object(value) assert h1 == h2 - def test_different_primitives_differ(self, hasher: BaseSemanticHasher) -> None: + def test_different_primitives_differ(self, hasher: SemanticAwarePythonHasher) -> None: """Different inputs produce different hashes (collision resistance).""" h_int = hasher.hash_object(42) h_str = hasher.hash_object("42") @@ -121,48 +121,48 @@ def test_different_primitives_differ(self, hasher: BaseSemanticHasher) -> None: # =================================================================== -# BaseSemanticHasher -- structures +# SemanticAwarePythonHasher -- structures # =================================================================== -class TestBaseSemanticHasherStructures: +class TestSemanticAwarePythonHasherStructures: """Structures (list, dict, tuple, set) are expanded and hashed.""" - def test_list_hashed(self, hasher: BaseSemanticHasher) -> None: + def test_list_hashed(self, hasher: SemanticAwarePythonHasher) -> None: result = hasher.hash_object([1, 2, 3]) assert isinstance(result, ContentHash) - def test_dict_hashed(self, hasher: BaseSemanticHasher) -> None: + def test_dict_hashed(self, hasher: SemanticAwarePythonHasher) -> None: result = hasher.hash_object({"a": 1, "b": 2}) assert isinstance(result, ContentHash) - def test_tuple_hashed(self, hasher: BaseSemanticHasher) -> None: + def test_tuple_hashed(self, hasher: SemanticAwarePythonHasher) -> None: result = hasher.hash_object((1, 2, 3)) assert isinstance(result, ContentHash) - def test_set_hashed(self, hasher: BaseSemanticHasher) -> None: + def test_set_hashed(self, hasher: SemanticAwarePythonHasher) -> None: result = hasher.hash_object({1, 2, 3}) assert isinstance(result, ContentHash) - def test_list_and_tuple_differ(self, hasher: BaseSemanticHasher) -> None: + def test_list_and_tuple_differ(self, hasher: SemanticAwarePythonHasher) -> None: """list and tuple with same elements produce different hashes.""" h_list = hasher.hash_object([1, 2, 3]) h_tuple = hasher.hash_object((1, 2, 3)) assert h_list != h_tuple - def test_set_order_independent(self, hasher: BaseSemanticHasher) -> None: + def test_set_order_independent(self, hasher: SemanticAwarePythonHasher) -> None: """Sets with the same elements hash identically regardless of insertion order.""" h1 = hasher.hash_object({3, 1, 2}) h2 = hasher.hash_object({1, 2, 3}) assert h1 == h2 - def test_dict_key_order_independent(self, hasher: BaseSemanticHasher) -> None: + def test_dict_key_order_independent(self, hasher: SemanticAwarePythonHasher) -> None: """Dicts with the same key-value pairs hash identically regardless of order.""" h1 = hasher.hash_object({"b": 2, "a": 1}) h2 = hasher.hash_object({"a": 1, "b": 2}) assert h1 == h2 - def test_nested_structures(self, hasher: BaseSemanticHasher) -> None: + def test_nested_structures(self, hasher: SemanticAwarePythonHasher) -> None: """Nested structures are hashed correctly.""" nested = {"key": [1, (2, 3)], "other": {"inner": True}} result = hasher.hash_object(nested) @@ -170,48 +170,48 @@ def test_nested_structures(self, hasher: BaseSemanticHasher) -> None: # Determinism assert result == hasher.hash_object(nested) - def test_different_structures_differ(self, hasher: BaseSemanticHasher) -> None: + def test_different_structures_differ(self, hasher: SemanticAwarePythonHasher) -> None: h1 = hasher.hash_object([1, 2]) h2 = hasher.hash_object([1, 2, 3]) assert h1 != h2 # =================================================================== -# BaseSemanticHasher -- ContentHash passthrough +# SemanticAwarePythonHasher -- ContentHash passthrough # =================================================================== -class TestBaseSemanticHasherContentHash: +class TestSemanticAwarePythonHasherContentHash: """ContentHash inputs are returned as-is (terminal).""" - def test_content_hash_passthrough(self, hasher: BaseSemanticHasher) -> None: + def test_content_hash_passthrough(self, hasher: SemanticAwarePythonHasher) -> None: ch = ContentHash(method="sha256", digest=b"\x00" * 32) result = hasher.hash_object(ch) assert result is ch # =================================================================== -# BaseSemanticHasher -- identity_structure resolution +# SemanticAwarePythonHasher -- identity_structure resolution # =================================================================== -class TestBaseSemanticHasherIdentityStructure: +class TestSemanticAwarePythonHasherIdentityStructure: """Objects implementing identity_structure() are resolved via it.""" - def test_identity_structure_object(self, hasher: BaseSemanticHasher) -> None: + def test_identity_structure_object(self, hasher: SemanticAwarePythonHasher) -> None: obj = _IdentityObj(structure={"name": "test", "version": 1}) result = hasher.hash_object(obj) assert isinstance(result, ContentHash) def test_identity_structure_deterministic( - self, hasher: BaseSemanticHasher + self, hasher: SemanticAwarePythonHasher ) -> None: obj1 = _IdentityObj(structure=[1, 2, 3]) obj2 = _IdentityObj(structure=[1, 2, 3]) assert hasher.hash_object(obj1) == hasher.hash_object(obj2) def test_different_identity_structures_differ( - self, hasher: BaseSemanticHasher + self, hasher: SemanticAwarePythonHasher ) -> None: obj1 = _IdentityObj(structure="alpha") obj2 = _IdentityObj(structure="beta") @@ -219,22 +219,22 @@ def test_different_identity_structures_differ( # =================================================================== -# BaseSemanticHasher -- strict mode +# SemanticAwarePythonHasher -- strict mode # =================================================================== -class TestBaseSemanticHasherStrictMode: +class TestSemanticAwarePythonHasherStrictMode: """Unknown type in strict mode raises TypeError.""" - def test_unknown_type_strict_raises(self, hasher: BaseSemanticHasher) -> None: + def test_unknown_type_strict_raises(self, hasher: SemanticAwarePythonHasher) -> None: class Unknown: pass - with pytest.raises(TypeError, match="no TypeHandlerProtocol registered"): + with pytest.raises(TypeError, match="no PythonTypeSemanticHasherProtocol registered"): hasher.hash_object(Unknown()) def test_unknown_type_lenient_succeeds( - self, lenient_hasher: BaseSemanticHasher + self, lenient_hasher: SemanticAwarePythonHasher ) -> None: class Unknown: pass @@ -244,26 +244,26 @@ class Unknown: # =================================================================== -# BaseSemanticHasher -- collision resistance +# SemanticAwarePythonHasher -- collision resistance # =================================================================== -class TestBaseSemanticHasherCollisionResistance: +class TestSemanticAwarePythonHasherCollisionResistance: """Different inputs produce different hashes.""" - def test_int_vs_string(self, hasher: BaseSemanticHasher) -> None: + def test_int_vs_string(self, hasher: SemanticAwarePythonHasher) -> None: assert hasher.hash_object(1) != hasher.hash_object("1") - def test_empty_list_vs_empty_tuple(self, hasher: BaseSemanticHasher) -> None: + def test_empty_list_vs_empty_tuple(self, hasher: SemanticAwarePythonHasher) -> None: assert hasher.hash_object([]) != hasher.hash_object(()) - def test_empty_dict_vs_empty_list(self, hasher: BaseSemanticHasher) -> None: + def test_empty_dict_vs_empty_list(self, hasher: SemanticAwarePythonHasher) -> None: assert hasher.hash_object({}) != hasher.hash_object([]) - def test_none_vs_string_none(self, hasher: BaseSemanticHasher) -> None: + def test_none_vs_string_none(self, hasher: SemanticAwarePythonHasher) -> None: assert hasher.hash_object(None) != hasher.hash_object("None") - def test_true_vs_one(self, hasher: BaseSemanticHasher) -> None: + def test_true_vs_one(self, hasher: SemanticAwarePythonHasher) -> None: """bool True and int 1 produce different hashes due to JSON encoding.""" h_true = hasher.hash_object(True) h_one = hasher.hash_object(1) @@ -271,34 +271,34 @@ def test_true_vs_one(self, hasher: BaseSemanticHasher) -> None: # =================================================================== -# TypeHandlerRegistry -- register/get_handler roundtrip +# PythonTypeSemanticHasherRegistry -- register/get_semantic_hasher roundtrip # =================================================================== -class TestTypeHandlerRegistryBasics: - """register() + get_handler() roundtrip.""" +class TestPythonTypeSemanticHasherRegistryBasics: + """register() + get_semantic_hasher() roundtrip.""" - def test_register_and_get_handler(self, registry: TypeHandlerRegistry) -> None: + def test_register_and_get_semantic_hasher(self, registry: PythonTypeSemanticHasherRegistry) -> None: handler = _FakeHandler() registry.register(int, handler) - assert registry.get_handler(42) is handler + assert registry.get_semantic_hasher(42) is handler - def test_get_handler_returns_none_for_unregistered( - self, registry: TypeHandlerRegistry + def test_get_semantic_hasher_returns_none_for_unregistered( + self, registry: PythonTypeSemanticHasherRegistry ) -> None: - assert registry.get_handler("hello") is None + assert registry.get_semantic_hasher("hello") is None # =================================================================== -# TypeHandlerRegistry -- MRO-aware lookup +# PythonTypeSemanticHasherRegistry -- MRO-aware lookup # =================================================================== -class TestTypeHandlerRegistryMRO: +class TestPythonTypeSemanticHasherRegistryMRO: """MRO-aware lookup: handler for parent class matches subclass.""" def test_subclass_inherits_parent_handler( - self, registry: TypeHandlerRegistry + self, registry: PythonTypeSemanticHasherRegistry ) -> None: class Base: pass @@ -308,10 +308,10 @@ class Child(Base): handler = _FakeHandler() registry.register(Base, handler) - assert registry.get_handler(Child()) is handler + assert registry.get_semantic_hasher(Child()) is handler def test_specific_handler_overrides_parent( - self, registry: TypeHandlerRegistry + self, registry: PythonTypeSemanticHasherRegistry ) -> None: class Base: pass @@ -323,46 +323,46 @@ class Child(Base): child_handler = _FakeHandler("child") registry.register(Base, parent_handler) registry.register(Child, child_handler) - assert registry.get_handler(Child()) is child_handler - assert registry.get_handler(Base()) is parent_handler + assert registry.get_semantic_hasher(Child()) is child_handler + assert registry.get_semantic_hasher(Base()) is parent_handler # =================================================================== -# TypeHandlerRegistry -- unregister +# PythonTypeSemanticHasherRegistry -- unregister # =================================================================== -class TestTypeHandlerRegistryUnregister: +class TestPythonTypeSemanticHasherRegistryUnregister: """unregister() removes handler.""" - def test_unregister_existing(self, registry: TypeHandlerRegistry) -> None: + def test_unregister_existing(self, registry: PythonTypeSemanticHasherRegistry) -> None: handler = _FakeHandler() registry.register(int, handler) result = registry.unregister(int) assert result is True - assert registry.get_handler(42) is None + assert registry.get_semantic_hasher(42) is None - def test_unregister_nonexistent(self, registry: TypeHandlerRegistry) -> None: + def test_unregister_nonexistent(self, registry: PythonTypeSemanticHasherRegistry) -> None: result = registry.unregister(float) assert result is False # =================================================================== -# TypeHandlerRegistry -- has_handler +# PythonTypeSemanticHasherRegistry -- has_semantic_hasher # =================================================================== -class TestTypeHandlerRegistryHasHandler: - """has_handler() boolean check.""" +class TestPythonTypeSemanticHasherRegistryHasSemanticHasher: + """has_semantic_hasher() boolean check.""" - def test_has_handler_true(self, registry: TypeHandlerRegistry) -> None: + def test_has_semantic_hasher_true(self, registry: PythonTypeSemanticHasherRegistry) -> None: registry.register(int, _FakeHandler()) - assert registry.has_handler(int) is True + assert registry.has_semantic_hasher(int) is True - def test_has_handler_false(self, registry: TypeHandlerRegistry) -> None: - assert registry.has_handler(str) is False + def test_has_semantic_hasher_false(self, registry: PythonTypeSemanticHasherRegistry) -> None: + assert registry.has_semantic_hasher(str) is False - def test_has_handler_via_mro(self, registry: TypeHandlerRegistry) -> None: + def test_has_semantic_hasher_via_mro(self, registry: PythonTypeSemanticHasherRegistry) -> None: class Base: pass @@ -370,21 +370,21 @@ class Child(Base): pass registry.register(Base, _FakeHandler()) - assert registry.has_handler(Child) is True + assert registry.has_semantic_hasher(Child) is True # =================================================================== -# TypeHandlerRegistry -- registered_types +# PythonTypeSemanticHasherRegistry -- registered_types # =================================================================== -class TestTypeHandlerRegistryRegisteredTypes: +class TestPythonTypeSemanticHasherRegistryRegisteredTypes: """registered_types() lists types.""" - def test_registered_types_empty(self, registry: TypeHandlerRegistry) -> None: + def test_registered_types_empty(self, registry: PythonTypeSemanticHasherRegistry) -> None: assert registry.registered_types() == [] - def test_registered_types_populated(self, registry: TypeHandlerRegistry) -> None: + def test_registered_types_populated(self, registry: PythonTypeSemanticHasherRegistry) -> None: registry.register(int, _FakeHandler()) registry.register(str, _FakeHandler()) types = registry.registered_types() @@ -392,14 +392,14 @@ def test_registered_types_populated(self, registry: TypeHandlerRegistry) -> None # =================================================================== -# TypeHandlerRegistry -- thread safety +# PythonTypeSemanticHasherRegistry -- thread safety # =================================================================== -class TestTypeHandlerRegistryThreadSafety: +class TestPythonTypeSemanticHasherRegistryThreadSafety: """Concurrent register/lookup doesn't crash.""" - def test_concurrent_register_lookup(self, registry: TypeHandlerRegistry) -> None: + def test_concurrent_register_lookup(self, registry: PythonTypeSemanticHasherRegistry) -> None: errors: list[Exception] = [] def register_types(start: int, count: int) -> None: @@ -413,9 +413,9 @@ def register_types(start: int, count: int) -> None: def lookup_types() -> None: try: for _ in range(100): - registry.get_handler(42) + registry.get_semantic_hasher(42) registry.registered_types() - registry.has_handler(int) + registry.has_semantic_hasher(int) except Exception as exc: errors.append(exc) @@ -435,13 +435,13 @@ def lookup_types() -> None: # =================================================================== -# BuiltinTypeHandlerRegistry +# BuiltinPythonTypeSemanticHasherRegistry # =================================================================== -class TestBuiltinTypeHandlerRegistry: - """BuiltinTypeHandlerRegistry is pre-populated with built-in handlers.""" +class TestBuiltinPythonTypeSemanticHasherRegistry: + """BuiltinPythonTypeSemanticHasherRegistry is pre-populated with built-in handlers.""" def test_construction(self) -> None: - reg = BuiltinTypeHandlerRegistry() + reg = BuiltinPythonTypeSemanticHasherRegistry() assert len(reg.registered_types()) > 0 From d29079c7406dc2abf90ff038e0f1648bd13518f0 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 21:48:56 +0000 Subject: [PATCH 23/33] =?UTF-8?q?fix(PLT-1660):=20address=20Copilot=20revi?= =?UTF-8?q?ew=20=E2=80=94=20utf-8=20encoding,=20return=20type=20annotation?= =?UTF-8?q?s,=20list=20element=20type=20inference,=20always=20register=20A?= =?UTF-8?q?rrowTableSemanticHasher?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../semantic_hashing/builtin_handlers.py | 20 +++++++++---------- src/orcapod/hashing/visitors.py | 10 +++++----- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py index fd5cef22..ca567f76 100644 --- a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py +++ b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py @@ -237,10 +237,11 @@ def register_builtin_python_type_semantic_hashers( ) -> None: """Register all built-in semantic hashers into *registry*. - When ``arrow_hasher`` is None, ``pa.Table`` and ``pa.RecordBatch`` handlers - are **not** registered (to avoid circular dependency in the JSON context - construction — the default context's ``python_type_semantic_hasher_registry`` - is built before ``arrow_hasher``). + ``pa.Table`` and ``pa.RecordBatch`` are always registered via + ``ArrowTableSemanticHasher``. When ``arrow_hasher`` is provided it is + passed through for immediate use; when ``None``, ``ArrowTableSemanticHasher`` + resolves the active arrow hasher lazily via ``get_default_context()`` at + hash time, breaking the construction-time circular dependency. Args: registry: The ``PythonTypeSemanticHasherRegistry`` to populate. @@ -249,7 +250,7 @@ def register_builtin_python_type_semantic_hashers( function_info_extractor: Optional ``FunctionInfoExtractorProtocol``. Defaults to ``FunctionSignatureExtractor``. arrow_hasher: Optional ``ArrowHasherProtocol`` for nested table hashing. - When None, Arrow table handlers are skipped. + When ``None``, lazy resolution via the default context is used. """ if file_hasher is None: from orcapod.hashing.file_hashers import BasicFileHasher @@ -293,11 +294,10 @@ def register_builtin_python_type_semantic_hashers( registry.register(Schema, SchemaSemanticHasher()) - if arrow_hasher is not None: - import pyarrow as _pa - arrow_table_hasher = ArrowTableSemanticHasher(arrow_hasher) - registry.register(_pa.Table, arrow_table_hasher) - registry.register(_pa.RecordBatch, arrow_table_hasher) + import pyarrow as _pa + arrow_table_hasher = ArrowTableSemanticHasher(arrow_hasher) + registry.register(_pa.Table, arrow_table_hasher) + registry.register(_pa.RecordBatch, arrow_table_hasher) logger.debug( "register_builtin_python_type_semantic_hashers: registered %d hashers", diff --git a/src/orcapod/hashing/visitors.py b/src/orcapod/hashing/visitors.py index 72015ebf..ee0da7d5 100644 --- a/src/orcapod/hashing/visitors.py +++ b/src/orcapod/hashing/visitors.py @@ -108,7 +108,7 @@ def visit(self, arrow_type: "pa.DataType", data: Any) -> tuple["pa.DataType", An def _visit_struct_fields( self, struct_type: "pa.StructType", data: dict | None - ) -> tuple["pa.StructType", dict]: + ) -> tuple["pa.StructType", dict | None]: """Recursively process struct fields. Default behavior for regular structs.""" if data is None: return struct_type, None @@ -126,7 +126,7 @@ def _visit_struct_fields( def _visit_list_elements( self, list_type: "pa.ListType", data: list | None - ) -> tuple["pa.DataType", list]: + ) -> tuple["pa.DataType", list | None]: """Recursively process list elements.""" if data is None: return list_type, None @@ -138,7 +138,7 @@ def _visit_list_elements( for item in data: current_element_type, processed_item = self.visit(element_type, item) processed_elements.append(processed_item) - if new_element_type is None: + if new_element_type is None and processed_item is not None: new_element_type = current_element_type if new_element_type is None: @@ -224,7 +224,7 @@ def visit_extension( # The "::" separator is unambiguous because to_prefixed_digest() uses only ":". type_name = extension_type.extension_name.replace(".", ":") hash_bytes = ( - type_name.encode("ascii") + type_name.encode("utf-8") + b"::" + content_hash.to_prefixed_digest() ) @@ -264,7 +264,7 @@ def visit_primitive( def _visit_struct_fields( self, struct_type: "pa.StructType", data: dict | None - ) -> tuple["pa.StructType", dict]: + ) -> tuple["pa.StructType", dict | None]: """Override to add field path tracking for better error messages.""" if data is None: return struct_type, None From 28a0987bbbda4b76f22cc9d29a8ea12abbf8cc23 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 22:18:40 +0000 Subject: [PATCH 24/33] refactor(hashing): revert PythonTypeSemanticHasherProtocol.hash() to return Any Handlers now return a representative Python structure instead of a ContentHash. SemanticAwarePythonHasher.hash_object() feeds the result back into hash_object() for final hashing, treating a returned ContentHash as a terminal (no re-hashing). Simple built-in handlers (UUID, Bytes, Function, TypeObject, SpecialForm, GenericAlias, UnionType) are simplified to return plain Python values/structures. Semantic handlers that compute content-based hashes from external data (Path, UPath, ArrowTable) continue to return ContentHash directly, which short-circuits hashing as before. Hash values are preserved: the extra hash_object() call is a no-op for the simple handlers since the structure they return is identical to what they previously delegated to hash_object() internally. Co-Authored-By: Claude Sonnet 4.6 --- .../semantic_hashing/builtin_handlers.py | 40 +++++++++---------- .../semantic_hashing/semantic_hasher.py | 14 +++++-- src/orcapod/protocols/hashing_protocols.py | 22 +++++++--- test-objective/unit/test_hashing.py | 4 +- tests/test_hashing/test_semantic_hasher.py | 8 ++-- 5 files changed, 52 insertions(+), 36 deletions(-) diff --git a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py index ca567f76..35f5a935 100644 --- a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py +++ b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py @@ -95,18 +95,18 @@ def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: class UUIDSemanticHasher: - """Hasher for ``uuid.UUID`` objects — hashes the raw 16-byte binary representation.""" + """Hasher for ``uuid.UUID`` objects — returns the raw 16-byte binary representation.""" - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: - return hasher.hash_object(obj.bytes) + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + return obj.bytes class BytesSemanticHasher: - """Hasher for bytes and bytearray objects — hashes the lowercase hex representation.""" + """Hasher for bytes and bytearray objects — returns the lowercase hex string.""" - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: if isinstance(obj, (bytes, bytearray)): - return hasher.hash_object(obj.hex()) + return obj.hex() raise TypeError( f"BytesSemanticHasher: expected bytes or bytearray, got {type(obj)!r}" ) @@ -123,7 +123,7 @@ class FunctionSemanticHasher: def __init__(self, function_info_extractor: Any) -> None: self.function_info_extractor = function_info_extractor - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: if not (callable(obj) and hasattr(obj, "__code__")): raise TypeError( f"FunctionSemanticHasher: expected a callable with __code__, got {type(obj)!r}" @@ -131,7 +131,7 @@ def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: func_name = getattr(obj, "__name__", repr(obj)) logger.debug("FunctionSemanticHasher: extracting info for function %r", func_name) info: dict[str, Any] = self.function_info_extractor.extract_function_info(obj) - return hasher.hash_object(info) + return info class TypeObjectSemanticHasher: @@ -140,51 +140,51 @@ class TypeObjectSemanticHasher: Returns a stable string of the form ``"type:."``. """ - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: if not isinstance(obj, type): raise TypeError( f"TypeObjectSemanticHasher: expected a type/class, got {type(obj)!r}" ) module: str = obj.__module__ or "" qualname: str = obj.__qualname__ - return hasher.hash_object(f"type:{module}.{qualname}") + return f"type:{module}.{qualname}" class SpecialFormSemanticHasher: """Hasher for ``typing._SpecialForm`` objects such as ``typing.Union``.""" - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: name = getattr(obj, "_name", None) or repr(obj) - return hasher.hash_object(f"special_form:typing.{name}") + return f"special_form:typing.{name}" class GenericAliasSemanticHasher: """Hasher for generic alias type annotations (``dict[int, str]``, ``Optional[X]``, etc.).""" - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: import typing origin = getattr(obj, "__origin__", None) args = getattr(obj, "__args__", None) or () if origin is None: - return hasher.hash_object(f"generic_alias:{obj!r}") + return f"generic_alias:{obj!r}" if origin is typing.Union: hashed_args = sorted(hasher.hash_object(arg).to_string() for arg in args) - return hasher.hash_object({"__type__": "union", "args": hashed_args}) - return hasher.hash_object({ + return {"__type__": "union", "args": hashed_args} + return { "__type__": "generic_alias", "origin": hasher.hash_object(origin).to_string(), "args": [hasher.hash_object(arg).to_string() for arg in args], - }) + } class UnionTypeSemanticHasher: """Hasher for ``types.UnionType`` objects (Python 3.10+ ``X | Y`` syntax).""" - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: args = getattr(obj, "__args__", None) or () hashed_args = sorted(hasher.hash_object(arg).to_string() for arg in args) - return hasher.hash_object({"__type__": "union", "args": hashed_args}) + return {"__type__": "union", "args": hashed_args} class ArrowTableSemanticHasher: @@ -221,7 +221,7 @@ def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: class SchemaSemanticHasher: """Hasher for ``Schema`` objects.""" - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: if not isinstance(obj, Schema): raise TypeError( f"SchemaSemanticHasher: expected a Schema, got {type(obj)!r}" diff --git a/src/orcapod/hashing/semantic_hashing/semantic_hasher.py b/src/orcapod/hashing/semantic_hashing/semantic_hasher.py index 300f6987..fbf5abb1 100644 --- a/src/orcapod/hashing/semantic_hashing/semantic_hasher.py +++ b/src/orcapod/hashing/semantic_hashing/semantic_hasher.py @@ -13,7 +13,9 @@ - Primitive → JSON-serialise + SHA-256 - Structure → delegate to ``_expand_structure``, then JSON-serialise the resulting tagged tree + SHA-256 - - Semantic hasher match → semantic_hasher.hash(obj, self) returns ContentHash directly + - Semantic hasher match → semantic_hasher.hash(obj, self) returns a representative + Python structure (or ContentHash as terminal); the result + is fed back into hash_object for final hashing - ContentIdentifiableProtocol→ call identity_structure(), recurse via hash_object - Fallback → strict error or best-effort string, then hash @@ -143,7 +145,8 @@ def hash_object( - ContentHash → terminal; returned as-is - Primitive → JSON-serialised and hashed directly - Structure → structurally expanded then hashed - - Semantic hasher match → semantic_hasher.hash(obj, self) returns ContentHash directly + - Semantic hasher match → handler.hash(obj, self) returns a representative Python + structure (or ContentHash); result is fed back into hash_object for final hashing - ContentIdentifiableProtocol→ resolver(obj) if resolver provided, else obj.content_hash() - Unknown type → TypeError in strict mode; best-effort otherwise @@ -174,7 +177,9 @@ def hash_object( ) return self._hash_to_content_hash(expanded) - # Semantic hasher dispatch: the hasher produces a ContentHash directly. + # Semantic hasher dispatch: handler returns a representative Python structure + # (or a ContentHash as terminal); feed the result back into hash_object so + # that returning a plain structure is equivalent to calling hash_object on it. semantic_hasher = self._registry.get_semantic_hasher(obj) if semantic_hasher is not None: logger.debug( @@ -182,7 +187,8 @@ def hash_object( type(obj).__name__, type(semantic_hasher).__name__, ) - return semantic_hasher.hash(obj, self) + result = semantic_hasher.hash(obj, self) + return self.hash_object(result, resolver=resolver) # ContentIdentifiableProtocol: use resolver if provided, else content_hash(). if isinstance(obj, hp.ContentIdentifiableProtocol): diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index e60d9c12..6a260d30 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -52,8 +52,9 @@ def content_hash(self, hasher: "SemanticHasherProtocol | None" = None) -> Conten class PythonTypeSemanticHasherProtocol(Protocol): """Protocol for type-specific semantic hashers used by SemanticAwarePythonHasher. - A ``PythonTypeSemanticHasherProtocol`` hashes a specific Python type to a - ``ContentHash``. Implementations are registered with a + A ``PythonTypeSemanticHasherProtocol`` converts a specific Python type into a + representative Python structure that ``SemanticAwarePythonHasher.hash_object()`` + can then hash. Implementations are registered with a ``PythonTypeSemanticHasherRegistry`` and looked up via MRO-aware resolution. Each implementation receives the full ``SemanticAwarePythonHasher`` so it can @@ -61,16 +62,25 @@ class PythonTypeSemanticHasherProtocol(Protocol): specific hasher instance. """ - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: - """Hash *obj* to a ContentHash. + def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + """Return a representative Python structure for *obj*. + + The returned value is passed back into + ``SemanticAwarePythonHasher.hash_object()`` for final hashing. Returning + a ``ContentHash`` short-circuits the process: the caller returns it as-is + without re-hashing. This is useful for handlers that compute content-based + hashes from external data (e.g. file content, Arrow tables). Args: obj: The object to hash. Always matches the registered type. hasher: The active ``SemanticAwarePythonHasher``. Use - ``hasher.hash_object(sub_value)`` to hash sub-values. + ``hasher.hash_object(sub_value)`` to hash sub-values that + require type-specific treatment. Returns: - ContentHash: The content-addressed hash of *obj*. + A representative Python structure (primitive, dict, list, bytes, etc.) + that will be passed into ``hash_object()`` for final hashing, or a + ``ContentHash`` to terminate hashing immediately. """ ... diff --git a/test-objective/unit/test_hashing.py b/test-objective/unit/test_hashing.py index 5dd04c8c..a72e2810 100644 --- a/test-objective/unit/test_hashing.py +++ b/test-objective/unit/test_hashing.py @@ -63,8 +63,8 @@ class _FakeHandler: def __init__(self, return_value: Any = "handled") -> None: self._return_value = return_value - def hash(self, obj: Any, hasher: SemanticAwarePythonHasher) -> ContentHash: - return ContentHash(method="fake", digest=str(self._return_value).encode()) + def hash(self, obj: Any, hasher: SemanticAwarePythonHasher) -> Any: + return str(self._return_value) class _IdentityObj: diff --git a/tests/test_hashing/test_semantic_hasher.py b/tests/test_hashing/test_semantic_hasher.py index c6584155..f2c9e84c 100644 --- a/tests/test_hashing/test_semantic_hasher.py +++ b/tests/test_hashing/test_semantic_hasher.py @@ -831,8 +831,8 @@ def __init__(self, tag: str) -> None: self.tag = tag def hash(self, obj: Any, hasher: Any) -> Any: - # Returns a ContentHash by delegating to the outer hasher - return hasher.hash_object(f"{self.tag}:{obj}") + # Returns a representative Python structure; outer hasher performs final hashing + return f"{self.tag}:{obj}" class Base: @@ -953,8 +953,8 @@ def __init__(self, degrees: float) -> None: class CelsiusHandler: - def hash(self, obj: Any, hasher: Any) -> ContentHash: - return hasher.hash_object({"__type__": "Celsius", "degrees": obj.degrees}) + def hash(self, obj: Any, hasher: Any) -> Any: + return {"__type__": "Celsius", "degrees": obj.degrees} class TestCustomHandlerRegistration: From a79641f2af8a217eec36fa3cb53a00d7f9cf992d Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 22:23:19 +0000 Subject: [PATCH 25/33] =?UTF-8?q?refactor(hashing):=20rename=20PythonTypeS?= =?UTF-8?q?emanticHasherProtocol=20=E2=86=92=20PythonTypeHandler,=20hash()?= =?UTF-8?q?=20=E2=86=92=20handle()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The protocol is now called PythonTypeHandler with a handle() method, more clearly reflecting its role as a type-specific handler that returns a representative Python structure rather than computing a ContentHash directly. All built-in handlers, the registry, the dispatch in SemanticAwarePythonHasher, and all test helpers are updated accordingly. Co-Authored-By: Claude Sonnet 4.6 --- src/orcapod/hashing/__init__.py | 6 ++--- .../hashing/semantic_hashing/__init__.py | 4 +-- .../semantic_hashing/builtin_handlers.py | 24 +++++++++--------- .../semantic_hashing/semantic_hasher.py | 16 ++++++------ .../semantic_hashing/type_handler_registry.py | 25 +++++++++++-------- src/orcapod/protocols/hashing_protocols.py | 6 ++--- test-objective/unit/test_hashing.py | 6 ++--- tests/test_hashing/test_semantic_hasher.py | 12 ++++----- 8 files changed, 51 insertions(+), 48 deletions(-) diff --git a/src/orcapod/hashing/__init__.py b/src/orcapod/hashing/__init__.py index ceb0b059..805028ae 100644 --- a/src/orcapod/hashing/__init__.py +++ b/src/orcapod/hashing/__init__.py @@ -5,7 +5,7 @@ ---------- SemanticAwarePythonHasher -- content-based recursive object hasher SemanticHasherProtocol -- protocol for semantic hashers - PythonTypeSemanticHasherRegistry -- registry mapping types to PythonTypeSemanticHasherProtocol instances + PythonTypeSemanticHasherRegistry -- registry mapping types to PythonTypeHandler instances get_default_semantic_hasher -- global default SemanticHasherProtocol factory get_default_python_type_semantic_hasher_registry -- global default registry factory ContentIdentifiableMixin -- convenience mixin for content-identifiable objects @@ -53,7 +53,7 @@ ContentIdentifiableProtocol, FileContentHasherProtocol, FunctionInfoExtractorProtocol, - PythonTypeSemanticHasherProtocol, + PythonTypeHandler, SemanticHasherProtocol, SemanticTypeHasherProtocol, StringCacherProtocol, @@ -97,7 +97,7 @@ "register_builtin_python_type_semantic_hashers", "SemanticHasherProtocol", "ContentIdentifiableProtocol", - "PythonTypeSemanticHasherProtocol", + "PythonTypeHandler", "FileContentHasherProtocol", "ArrowHasherProtocol", "StringCacherProtocol", diff --git a/src/orcapod/hashing/semantic_hashing/__init__.py b/src/orcapod/hashing/semantic_hashing/__init__.py index db0eb765..84781a32 100644 --- a/src/orcapod/hashing/semantic_hashing/__init__.py +++ b/src/orcapod/hashing/semantic_hashing/__init__.py @@ -2,11 +2,11 @@ orcapod.hashing.semantic_hashing ================================= SemanticAwarePythonHasher -- content-based recursive object hasher - PythonTypeSemanticHasherRegistry -- MRO-aware registry mapping types → PythonTypeSemanticHasherProtocol + PythonTypeSemanticHasherRegistry -- MRO-aware registry mapping types → PythonTypeHandler BuiltinPythonTypeSemanticHasherRegistry -- pre-populated registry with built-in hashers ContentIdentifiableMixin -- convenience mixin for content-identifiable objects -Built-in PythonTypeSemanticHasherProtocol implementations: +Built-in PythonTypeHandler implementations: PathSemanticHasher -- pathlib.Path → file-content hash UUIDSemanticHasher -- uuid.UUID → canonical bytes BytesSemanticHasher -- bytes/bytearray → hex string diff --git a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py index 35f5a935..371950db 100644 --- a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py +++ b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py @@ -1,5 +1,5 @@ """ -Built-in PythonTypeSemanticHasherProtocol implementations. +Built-in PythonTypeHandler implementations. PathSemanticHasher -- pathlib.Path: file content hash UPathSemanticHasher -- upath.UPath: file content hash (remote-aware) @@ -51,7 +51,7 @@ class PathSemanticHasher: def __init__(self, file_hasher: "FileContentHasherProtocol") -> None: self.file_hasher = file_hasher - def hash(self, obj: PathLike, hasher: "SemanticAwarePythonHasher") -> ContentHash: + def handle(self, obj: PathLike, hasher: "SemanticAwarePythonHasher") -> ContentHash: path: Path = Path(obj) if not path.exists(): raise FileNotFoundError( @@ -77,7 +77,7 @@ class UPathSemanticHasher: def __init__(self, file_hasher: "FileContentHasherProtocol") -> None: self.file_hasher = file_hasher - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: if not isinstance(obj, UPath): raise TypeError( f"UPathSemanticHasher: expected a UPath, got {type(obj)!r}." @@ -97,14 +97,14 @@ def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: class UUIDSemanticHasher: """Hasher for ``uuid.UUID`` objects — returns the raw 16-byte binary representation.""" - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: return obj.bytes class BytesSemanticHasher: """Hasher for bytes and bytearray objects — returns the lowercase hex string.""" - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: if isinstance(obj, (bytes, bytearray)): return obj.hex() raise TypeError( @@ -123,7 +123,7 @@ class FunctionSemanticHasher: def __init__(self, function_info_extractor: Any) -> None: self.function_info_extractor = function_info_extractor - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: if not (callable(obj) and hasattr(obj, "__code__")): raise TypeError( f"FunctionSemanticHasher: expected a callable with __code__, got {type(obj)!r}" @@ -140,7 +140,7 @@ class TypeObjectSemanticHasher: Returns a stable string of the form ``"type:."``. """ - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: if not isinstance(obj, type): raise TypeError( f"TypeObjectSemanticHasher: expected a type/class, got {type(obj)!r}" @@ -153,7 +153,7 @@ def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: class SpecialFormSemanticHasher: """Hasher for ``typing._SpecialForm`` objects such as ``typing.Union``.""" - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: name = getattr(obj, "_name", None) or repr(obj) return f"special_form:typing.{name}" @@ -161,7 +161,7 @@ def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: class GenericAliasSemanticHasher: """Hasher for generic alias type annotations (``dict[int, str]``, ``Optional[X]``, etc.).""" - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: import typing origin = getattr(obj, "__origin__", None) @@ -181,7 +181,7 @@ def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: class UnionTypeSemanticHasher: """Hasher for ``types.UnionType`` objects (Python 3.10+ ``X | Y`` syntax).""" - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: args = getattr(obj, "__args__", None) or () hashed_args = sorted(hasher.hash_object(arg).to_string() for arg in args) return {"__type__": "union", "args": hashed_args} @@ -206,7 +206,7 @@ def _get_arrow_hasher(self) -> "ArrowHasherProtocol": from orcapod.contexts import get_default_context return get_default_context().arrow_hasher # type: ignore[return-value] - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: import pyarrow as _pa if isinstance(obj, _pa.RecordBatch): @@ -221,7 +221,7 @@ def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: class SchemaSemanticHasher: """Hasher for ``Schema`` objects.""" - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: if not isinstance(obj, Schema): raise TypeError( f"SchemaSemanticHasher: expected a Schema, got {type(obj)!r}" diff --git a/src/orcapod/hashing/semantic_hashing/semantic_hasher.py b/src/orcapod/hashing/semantic_hashing/semantic_hasher.py index fbf5abb1..24211741 100644 --- a/src/orcapod/hashing/semantic_hashing/semantic_hasher.py +++ b/src/orcapod/hashing/semantic_hashing/semantic_hasher.py @@ -13,7 +13,7 @@ - Primitive → JSON-serialise + SHA-256 - Structure → delegate to ``_expand_structure``, then JSON-serialise the resulting tagged tree + SHA-256 - - Semantic hasher match → semantic_hasher.hash(obj, self) returns a representative + - Semantic hasher match → handler.handle(obj, self) returns a representative Python structure (or ContentHash as terminal); the result is fed back into hash_object for final hashing - ContentIdentifiableProtocol→ call identity_structure(), recurse via hash_object @@ -92,7 +92,7 @@ class SemanticAwarePythonHasher: Embedded in every ContentHash produced. type_semantic_hasher_registry: ``PythonTypeSemanticHasherRegistry`` for MRO-aware lookup of - ``PythonTypeSemanticHasherProtocol`` instances. + ``PythonTypeHandler`` instances. If None, the default registry is used. strict: When True (default) raises TypeError for unhandled types. @@ -145,7 +145,7 @@ def hash_object( - ContentHash → terminal; returned as-is - Primitive → JSON-serialised and hashed directly - Structure → structurally expanded then hashed - - Semantic hasher match → handler.hash(obj, self) returns a representative Python + - Semantic hasher match → handler.handle(obj, self) returns a representative Python structure (or ContentHash); result is fed back into hash_object for final hashing - ContentIdentifiableProtocol→ resolver(obj) if resolver provided, else obj.content_hash() - Unknown type → TypeError in strict mode; best-effort otherwise @@ -187,7 +187,7 @@ def hash_object( type(obj).__name__, type(semantic_hasher).__name__, ) - result = semantic_hasher.hash(obj, self) + result = semantic_hasher.handle(obj, self) return self.hash_object(result, resolver=resolver) # ContentIdentifiableProtocol: use resolver if provided, else content_hash(). @@ -366,7 +366,7 @@ def _hash_to_content_hash(self, obj: Any) -> ContentHash: except (TypeError, ValueError) as exc: raise TypeError( f"SemanticAwarePythonHasher: failed to JSON-serialise object of type " - f"{type(obj).__name__!r}. Ensure all PythonTypeSemanticHasherProtocol " + f"{type(obj).__name__!r}. Ensure all PythonTypeHandler " "implementations and identity_structure() return JSON-serialisable " "primitives or structures." ) from exc @@ -389,15 +389,15 @@ def _handle_unknown(self, obj: Any) -> str: if self._strict: raise TypeError( - f"SemanticAwarePythonHasher (strict): no PythonTypeSemanticHasherProtocol " + f"SemanticAwarePythonHasher (strict): no PythonTypeHandler " f"registered for type '{qualified}' and it does not implement " - "ContentIdentifiableProtocol. Register a PythonTypeSemanticHasherProtocol " + "ContentIdentifiableProtocol. Register a PythonTypeHandler " "via the PythonTypeSemanticHasherRegistry or implement " "identity_structure() on the class." ) logger.warning( - "SemanticAwarePythonHasher (non-strict): no PythonTypeSemanticHasherProtocol registered for type '%s'. " + "SemanticAwarePythonHasher (non-strict): no PythonTypeHandler registered for type '%s'. " "Falling back to best-effort string representation.", qualified, ) diff --git a/src/orcapod/hashing/semantic_hashing/type_handler_registry.py b/src/orcapod/hashing/semantic_hashing/type_handler_registry.py index ebae2cb5..decfaf0c 100644 --- a/src/orcapod/hashing/semantic_hashing/type_handler_registry.py +++ b/src/orcapod/hashing/semantic_hashing/type_handler_registry.py @@ -1,5 +1,8 @@ """ -PythonTypeSemanticHasherRegistry — MRO-aware registry for PythonTypeSemanticHasherProtocol instances. +PythonTypeSemanticHasherRegistry — MRO-aware registry for PythonTypeHandler instances. + +``PythonTypeHandler`` is the protocol for type-specific handlers; this registry +provides MRO-aware lookup so subclasses inherit their parent's handler. """ from __future__ import annotations @@ -11,14 +14,14 @@ if TYPE_CHECKING: from orcapod.protocols.hashing_protocols import ( ArrowHasherProtocol, - PythonTypeSemanticHasherProtocol, + PythonTypeHandler, ) logger = logging.getLogger(__name__) class PythonTypeSemanticHasherRegistry: - """Registry mapping Python types to PythonTypeSemanticHasherProtocol instances. + """Registry mapping Python types to PythonTypeHandler instances. Lookup is MRO-aware: when no hasher is registered for the exact type of an object, the registry walks the object's MRO (most-derived first) until @@ -31,20 +34,20 @@ class PythonTypeSemanticHasherRegistry: """ def __init__( - self, handlers: list[tuple[type, "PythonTypeSemanticHasherProtocol"]] | None = None + self, handlers: list[tuple[type, "PythonTypeHandler"]] | None = None ) -> None: """ Args: handlers: Optional list of ``(target_type, hasher)`` pairs to register at construction time. """ - self._handlers: dict[type, "PythonTypeSemanticHasherProtocol"] = {} + self._handlers: dict[type, "PythonTypeHandler"] = {} self._lock = threading.RLock() if handlers: for target_type, handler in handlers: self.register(target_type, handler) - def register(self, target_type: type, handler: "PythonTypeSemanticHasherProtocol") -> None: + def register(self, target_type: type, handler: "PythonTypeHandler") -> None: """Register a hasher for a specific Python type. If a hasher is already registered for *target_type*, it is silently @@ -52,7 +55,7 @@ def register(self, target_type: type, handler: "PythonTypeSemanticHasherProtocol Args: target_type: The Python type (or class) for which the hasher should be used. - handler: A ``PythonTypeSemanticHasherProtocol`` instance. + handler: A ``PythonTypeHandler`` instance. Raises: TypeError: If ``target_type`` is not a ``type``. @@ -87,14 +90,14 @@ def unregister(self, target_type: type) -> bool: return True return False - def get_semantic_hasher(self, obj: Any) -> "PythonTypeSemanticHasherProtocol | None": + def get_semantic_hasher(self, obj: Any) -> "PythonTypeHandler | None": """Look up the hasher for *obj* using MRO-aware resolution. Args: obj: The object for which a hasher is needed. Returns: - The registered ``PythonTypeSemanticHasherProtocol``, or None. + The registered ``PythonTypeHandler``, or None. """ obj_type = type(obj) with self._lock: @@ -114,14 +117,14 @@ def get_semantic_hasher(self, obj: Any) -> "PythonTypeSemanticHasherProtocol | N def get_semantic_hasher_for_type( self, target_type: type - ) -> "PythonTypeSemanticHasherProtocol | None": + ) -> "PythonTypeHandler | None": """Look up the hasher for a *type object* (rather than an instance). Args: target_type: The type to look up. Returns: - The registered ``PythonTypeSemanticHasherProtocol``, or None. + The registered ``PythonTypeHandler``, or None. """ with self._lock: handler = self._handlers.get(target_type) diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index 6a260d30..d2a2f890 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -49,10 +49,10 @@ def content_hash(self, hasher: "SemanticHasherProtocol | None" = None) -> Conten ... -class PythonTypeSemanticHasherProtocol(Protocol): +class PythonTypeHandler(Protocol): """Protocol for type-specific semantic hashers used by SemanticAwarePythonHasher. - A ``PythonTypeSemanticHasherProtocol`` converts a specific Python type into a + A ``PythonTypeHandler`` converts a specific Python type into a representative Python structure that ``SemanticAwarePythonHasher.hash_object()`` can then hash. Implementations are registered with a ``PythonTypeSemanticHasherRegistry`` and looked up via MRO-aware resolution. @@ -62,7 +62,7 @@ class PythonTypeSemanticHasherProtocol(Protocol): specific hasher instance. """ - def hash(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: """Return a representative Python structure for *obj*. The returned value is passed back into diff --git a/test-objective/unit/test_hashing.py b/test-objective/unit/test_hashing.py index a72e2810..695c01cd 100644 --- a/test-objective/unit/test_hashing.py +++ b/test-objective/unit/test_hashing.py @@ -58,12 +58,12 @@ def lenient_hasher(registry: PythonTypeSemanticHasherRegistry) -> SemanticAwareP class _FakeHandler: - """Minimal object satisfying PythonTypeSemanticHasherProtocol for testing.""" + """Minimal object satisfying PythonTypeHandler for testing.""" def __init__(self, return_value: Any = "handled") -> None: self._return_value = return_value - def hash(self, obj: Any, hasher: SemanticAwarePythonHasher) -> Any: + def handle(self, obj: Any, hasher: SemanticAwarePythonHasher) -> Any: return str(self._return_value) @@ -230,7 +230,7 @@ def test_unknown_type_strict_raises(self, hasher: SemanticAwarePythonHasher) -> class Unknown: pass - with pytest.raises(TypeError, match="no PythonTypeSemanticHasherProtocol registered"): + with pytest.raises(TypeError, match="no PythonTypeHandler registered"): hasher.hash_object(Unknown()) def test_unknown_type_lenient_succeeds( diff --git a/tests/test_hashing/test_semantic_hasher.py b/tests/test_hashing/test_semantic_hasher.py index f2c9e84c..5b3b04a2 100644 --- a/tests/test_hashing/test_semantic_hasher.py +++ b/tests/test_hashing/test_semantic_hasher.py @@ -299,7 +299,7 @@ def __init__(self, x: int) -> None: class TestStrictMode: def test_strict_raises_on_unknown_type(self, hasher): - with pytest.raises(TypeError, match="no PythonTypeSemanticHasherProtocol registered"): + with pytest.raises(TypeError, match="no PythonTypeHandler registered"): hasher.hash_object(Unhandled(1)) def test_non_strict_returns_content_hash(self, lenient_hasher): @@ -830,7 +830,7 @@ class _DummySemanticHasher: def __init__(self, tag: str) -> None: self.tag = tag - def hash(self, obj: Any, hasher: Any) -> Any: + def handle(self, obj: Any, hasher: Any) -> Any: # Returns a representative Python structure; outer hasher performs final hashing return f"{self.tag}:{obj}" @@ -953,7 +953,7 @@ def __init__(self, degrees: float) -> None: class CelsiusHandler: - def hash(self, obj: Any, hasher: Any) -> Any: + def handle(self, obj: Any, hasher: Any) -> Any: return {"__type__": "Celsius", "degrees": obj.degrees} @@ -1008,7 +1008,7 @@ def test_handler_returning_content_hash_is_terminal(self): """A handler that returns a ContentHash must not be re-hashed.""" class DirectHashHandler: - def hash(self, obj: Any, hasher: Any) -> ContentHash: + def handle(self, obj: Any, hasher: Any) -> ContentHash: return ContentHash("direct", b"\xaa" * 32) registry = PythonTypeSemanticHasherRegistry() @@ -1042,8 +1042,8 @@ def __init__(self, k: float) -> None: self.k = k class KelvinHandler: - def hash(self, obj: Any, hasher: Any) -> ContentHash: - return hasher.hash_object({"__type__": "Kelvin", "k": obj.k}) + def handle(self, obj: Any, hasher: Any) -> Any: + return {"__type__": "Kelvin", "k": obj.k} global_registry = get_default_python_type_semantic_hasher_registry() global_registry.register(Kelvin, KelvinHandler()) From 764a1bf70525f8ff7dfd1d383629d4c89e60d060 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 23:16:26 +0000 Subject: [PATCH 26/33] =?UTF-8?q?refactor(hashing):=20rename=20*SemanticHa?= =?UTF-8?q?sher=20=E2=86=92=20*Handler,=20PythonTypeSemanticHasherRegistry?= =?UTF-8?q?=20=E2=86=92=20PythonTypeHandlerRegistry?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mechanical rename across all source files, JSON configs, and tests: - PathSemanticHasher → PathHandler, UPathSemanticHasher → UPathHandler, UUIDSemanticHasher → UUIDHandler, BytesSemanticHasher → BytesHandler, FunctionSemanticHasher → FunctionHandler, TypeObjectSemanticHasher → TypeObjectHandler, SpecialFormSemanticHasher → SpecialFormHandler, GenericAliasSemanticHasher → GenericAliasHandler, UnionTypeSemanticHasher → UnionTypeHandler, ArrowTableSemanticHasher → ArrowTableHandler, SchemaSemanticHasher → SchemaHandler - register_builtin_python_type_semantic_hashers → register_builtin_python_type_handlers - PythonTypeSemanticHasherRegistry → PythonTypeHandlerRegistry - BuiltinPythonTypeSemanticHasherRegistry → BuiltinPythonTypeHandlerRegistry - get_default_python_type_semantic_hasher_registry → get_default_python_type_handler_registry - type_semantic_hasher_registry param/property → type_handler_registry - JSON config keys and _class values updated accordingly No logic changes. All 3717 tests pass. --- src/orcapod/contexts/core.py | 4 +- .../contexts/data/schemas/context_schema.json | 10 +- src/orcapod/contexts/data/v0.1.json | 38 +- src/orcapod/core/datagrams/datagram.py | 4 +- src/orcapod/hashing/__init__.py | 52 +-- src/orcapod/hashing/defaults.py | 10 +- .../hashing/semantic_hashing/__init__.py | 50 +-- .../semantic_hashing/builtin_handlers.py | 112 ++--- .../semantic_hashing/semantic_hasher.py | 22 +- .../semantic_hashing/type_handler_registry.py | 24 +- src/orcapod/hashing/versioned_hashers.py | 14 +- src/orcapod/hashing/visitors.py | 2 +- src/orcapod/protocols/hashing_protocols.py | 8 +- ...06-24-rename-semantic-hasher-to-handler.md | 422 ++++++++++++++++++ test-objective/unit/test_hashing.py | 78 ++-- .../test_extension_type_hashing.py | 6 +- tests/test_hashing/test_semantic_hasher.py | 96 ++-- tests/test_hashing/test_uuid_handler.py | 16 +- 18 files changed, 695 insertions(+), 273 deletions(-) create mode 100644 superpowers/plans/2026-06-24-rename-semantic-hasher-to-handler.md diff --git a/src/orcapod/contexts/core.py b/src/orcapod/contexts/core.py index 6b4aa2ca..d84ae67f 100644 --- a/src/orcapod/contexts/core.py +++ b/src/orcapod/contexts/core.py @@ -21,8 +21,8 @@ class DataContext: registration. This is the single public API for all type operations. arrow_hasher: Arrow table hasher for this context semantic_hasher: General semantic hasher for this context. The - ``PythonTypeSemanticHasherRegistry`` used for hashing is accessible via - ``semantic_hasher.type_semantic_hasher_registry``. + ``PythonTypeHandlerRegistry`` used for hashing is accessible via + ``semantic_hasher.type_handler_registry``. """ context_key: str diff --git a/src/orcapod/contexts/data/schemas/context_schema.json b/src/orcapod/contexts/data/schemas/context_schema.json index 1a908dfc..73f07dd4 100644 --- a/src/orcapod/contexts/data/schemas/context_schema.json +++ b/src/orcapod/contexts/data/schemas/context_schema.json @@ -53,17 +53,17 @@ "$ref": "#/$defs/objectspec", "description": "ObjectSpec for the semantic hasher component" }, - "python_type_semantic_hasher_registry": { + "python_type_handler_registry": { "$ref": "#/$defs/objectspec", - "description": "ObjectSpec for the PythonTypeSemanticHasherRegistry used by the semantic hasher" + "description": "ObjectSpec for the PythonTypeHandlerRegistry used by the semantic hasher" }, "file_hasher": { "$ref": "#/$defs/objectspec", - "description": "ObjectSpec for the file content hasher (used by PathSemanticHasher)" + "description": "ObjectSpec for the file content hasher (used by PathHandler)" }, "function_info_extractor": { "$ref": "#/$defs/objectspec", - "description": "ObjectSpec for the function info extractor (used by FunctionSemanticHasher)" + "description": "ObjectSpec for the function info extractor (used by FunctionHandler)" }, "metadata": { "type": "object", @@ -180,7 +180,7 @@ "_class": "orcapod.hashing.semantic_hashing.semantic_hasher.SemanticAwarePythonHasher", "_config": { "hasher_id": "semantic_v0.1", - "type_semantic_hasher_registry": {"_ref": "python_type_semantic_hasher_registry"} + "type_handler_registry": {"_ref": "python_type_handler_registry"} } }, "metadata": { diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index 447db766..07e8e686 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -57,25 +57,25 @@ "include_defaults": true } }, - "python_type_semantic_hasher_registry": { - "_class": "orcapod.hashing.semantic_hashing.type_handler_registry.PythonTypeSemanticHasherRegistry", + "python_type_handler_registry": { + "_class": "orcapod.hashing.semantic_hashing.type_handler_registry.PythonTypeHandlerRegistry", "_config": { "handlers": [ - [{"_type": "builtins.bytes"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesSemanticHasher", "_config": {}}], - [{"_type": "builtins.bytearray"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesSemanticHasher", "_config": {}}], - [{"_type": "pathlib.Path"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.PathSemanticHasher", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], - [{"_type": "upath.core.UPath"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UPathSemanticHasher", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], - [{"_type": "uuid.UUID"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UUIDSemanticHasher", "_config": {}}], - [{"_type": "types.FunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionSemanticHasher", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], - [{"_type": "types.BuiltinFunctionType"},{"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionSemanticHasher", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], - [{"_type": "types.MethodType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionSemanticHasher", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], - [{"_type": "builtins.type"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.TypeObjectSemanticHasher", "_config": {}}], - [{"_type": "types.GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasSemanticHasher", "_config": {}}], - [{"_type": "types.UnionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UnionTypeSemanticHasher", "_config": {}}], - [{"_type": "typing._GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasSemanticHasher", "_config": {}}], - [{"_type": "typing._SpecialForm"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.SpecialFormSemanticHasher", "_config": {}}], - [{"_type": "pyarrow.Table"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableSemanticHasher", "_config": {}}], - [{"_type": "pyarrow.RecordBatch"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableSemanticHasher", "_config": {}}] + [{"_type": "builtins.bytes"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesHandler", "_config": {}}], + [{"_type": "builtins.bytearray"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesHandler", "_config": {}}], + [{"_type": "pathlib.Path"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.PathHandler", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], + [{"_type": "upath.core.UPath"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UPathHandler", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], + [{"_type": "uuid.UUID"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UUIDHandler", "_config": {}}], + [{"_type": "types.FunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], + [{"_type": "types.BuiltinFunctionType"},{"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], + [{"_type": "types.MethodType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], + [{"_type": "builtins.type"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.TypeObjectHandler", "_config": {}}], + [{"_type": "types.GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasHandler", "_config": {}}], + [{"_type": "types.UnionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UnionTypeHandler", "_config": {}}], + [{"_type": "typing._GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasHandler", "_config": {}}], + [{"_type": "typing._SpecialForm"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.SpecialFormHandler", "_config": {}}], + [{"_type": "pyarrow.Table"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableHandler", "_config": {}}], + [{"_type": "pyarrow.RecordBatch"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableHandler", "_config": {}}] ] } }, @@ -83,8 +83,8 @@ "_class": "orcapod.hashing.semantic_hashing.semantic_hasher.SemanticAwarePythonHasher", "_config": { "hasher_id": "semantic_v0.1", - "type_semantic_hasher_registry": { - "_ref": "python_type_semantic_hasher_registry" + "type_handler_registry": { + "_ref": "python_type_handler_registry" } } }, diff --git a/src/orcapod/core/datagrams/datagram.py b/src/orcapod/core/datagrams/datagram.py index 5ebae203..8fa2b48b 100644 --- a/src/orcapod/core/datagrams/datagram.py +++ b/src/orcapod/core/datagrams/datagram.py @@ -12,7 +12,7 @@ - **Dict for value access**: ``__getitem__``, ``get``, ``as_dict()`` always operate through the Python dict (loaded lazily from Arrow when needed). - **Arrow for hashing**: ``content_hash()`` always uses the Arrow table (loaded lazily from - dict when needed) via the data context's ``ArrowTableSemanticHasher``. + dict when needed) via the data context's ``ArrowTableHandler``. - **Meta is always dict**: meta columns are stored as a Python dict regardless of how the primary data was provided; the Arrow meta table is built lazily. """ @@ -418,7 +418,7 @@ def arrow_schema( def identity_structure(self) -> Any: """Return the primary data table as this datagram's identity. - The semantic hasher dispatches ``pa.Table`` to ``ArrowTableSemanticHasher``, + The semantic hasher dispatches ``pa.Table`` to ``ArrowTableHandler``, which delegates to the data context's ``arrow_hasher``. This means ``content_hash()`` (inherited from ``ContentIdentifiableBase``) produces a stable, content-addressed hash of the data columns without any diff --git a/src/orcapod/hashing/__init__.py b/src/orcapod/hashing/__init__.py index 805028ae..658180a0 100644 --- a/src/orcapod/hashing/__init__.py +++ b/src/orcapod/hashing/__init__.py @@ -5,18 +5,18 @@ ---------- SemanticAwarePythonHasher -- content-based recursive object hasher SemanticHasherProtocol -- protocol for semantic hashers - PythonTypeSemanticHasherRegistry -- registry mapping types to PythonTypeHandler instances + PythonTypeHandlerRegistry -- registry mapping types to PythonTypeHandler instances get_default_semantic_hasher -- global default SemanticHasherProtocol factory - get_default_python_type_semantic_hasher_registry -- global default registry factory + get_default_python_type_handler_registry -- global default registry factory ContentIdentifiableMixin -- convenience mixin for content-identifiable objects Built-in hashers (importable for custom registry setup): - PathSemanticHasher - UUIDSemanticHasher - BytesSemanticHasher - FunctionSemanticHasher - TypeObjectSemanticHasher - register_builtin_python_type_semantic_hashers + PathHandler + UUIDHandler + BytesHandler + FunctionHandler + TypeObjectHandler + register_builtin_python_type_handlers Utility: FileContentHasherProtocol @@ -27,26 +27,26 @@ from orcapod.hashing.defaults import ( get_default_arrow_hasher, - get_default_python_type_semantic_hasher_registry, + get_default_python_type_handler_registry, get_default_semantic_hasher, ) from orcapod.hashing.file_hashers import BasicFileHasher, CachedFileHasher from orcapod.hashing.hash_utils import hash_file from orcapod.hashing.semantic_hashing.builtin_handlers import ( - BytesSemanticHasher, - FunctionSemanticHasher, - PathSemanticHasher, - TypeObjectSemanticHasher, - UUIDSemanticHasher, - register_builtin_python_type_semantic_hashers, + BytesHandler, + FunctionHandler, + PathHandler, + TypeObjectHandler, + UUIDHandler, + register_builtin_python_type_handlers, ) from orcapod.hashing.semantic_hashing.content_identifiable_mixin import ( ContentIdentifiableMixin, ) from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher from orcapod.hashing.semantic_hashing.type_handler_registry import ( - BuiltinPythonTypeSemanticHasherRegistry, - PythonTypeSemanticHasherRegistry, + BuiltinPythonTypeHandlerRegistry, + PythonTypeHandlerRegistry, ) from orcapod.protocols.hashing_protocols import ( ArrowHasherProtocol, @@ -84,17 +84,17 @@ __all__ = [ "SemanticAwarePythonHasher", - "PythonTypeSemanticHasherRegistry", - "BuiltinPythonTypeSemanticHasherRegistry", - "get_default_python_type_semantic_hasher_registry", + "PythonTypeHandlerRegistry", + "BuiltinPythonTypeHandlerRegistry", + "get_default_python_type_handler_registry", "get_default_semantic_hasher", "ContentIdentifiableMixin", - "PathSemanticHasher", - "UUIDSemanticHasher", - "BytesSemanticHasher", - "FunctionSemanticHasher", - "TypeObjectSemanticHasher", - "register_builtin_python_type_semantic_hashers", + "PathHandler", + "UUIDHandler", + "BytesHandler", + "FunctionHandler", + "TypeObjectHandler", + "register_builtin_python_type_handlers", "SemanticHasherProtocol", "ContentIdentifiableProtocol", "PythonTypeHandler", diff --git a/src/orcapod/hashing/defaults.py b/src/orcapod/hashing/defaults.py index 21034936..fb95675b 100644 --- a/src/orcapod/hashing/defaults.py +++ b/src/orcapod/hashing/defaults.py @@ -10,24 +10,24 @@ # from its JSON spec. Constructing them here would bypass versioning and # produce hashers that are decoupled from the active data context. -from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeSemanticHasherRegistry +from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeHandlerRegistry from orcapod.protocols import hashing_protocols as hp -def get_default_python_type_semantic_hasher_registry() -> PythonTypeSemanticHasherRegistry: +def get_default_python_type_handler_registry() -> PythonTypeHandlerRegistry: """ - Return the ``PythonTypeSemanticHasherRegistry`` from the default data context's + Return the ``PythonTypeHandlerRegistry`` from the default data context's semantic hasher. The registry is owned by the active ``SemanticAwarePythonHasher``, which is itself versioned inside the active ``DataContext``. Returns: - PythonTypeSemanticHasherRegistry: The type semantic hasher registry from the + PythonTypeHandlerRegistry: The type handler registry from the default data context. """ from orcapod.contexts import get_default_context - return get_default_context().semantic_hasher.type_semantic_hasher_registry + return get_default_context().semantic_hasher.type_handler_registry def get_default_semantic_hasher() -> hp.SemanticHasherProtocol: diff --git a/src/orcapod/hashing/semantic_hashing/__init__.py b/src/orcapod/hashing/semantic_hashing/__init__.py index 84781a32..67d4bd64 100644 --- a/src/orcapod/hashing/semantic_hashing/__init__.py +++ b/src/orcapod/hashing/semantic_hashing/__init__.py @@ -2,31 +2,31 @@ orcapod.hashing.semantic_hashing ================================= SemanticAwarePythonHasher -- content-based recursive object hasher - PythonTypeSemanticHasherRegistry -- MRO-aware registry mapping types → PythonTypeHandler - BuiltinPythonTypeSemanticHasherRegistry -- pre-populated registry with built-in hashers + PythonTypeHandlerRegistry -- MRO-aware registry mapping types → PythonTypeHandler + BuiltinPythonTypeHandlerRegistry -- pre-populated registry with built-in hashers ContentIdentifiableMixin -- convenience mixin for content-identifiable objects Built-in PythonTypeHandler implementations: - PathSemanticHasher -- pathlib.Path → file-content hash - UUIDSemanticHasher -- uuid.UUID → canonical bytes - BytesSemanticHasher -- bytes/bytearray → hex string - FunctionSemanticHasher -- callable → via FunctionInfoExtractorProtocol - TypeObjectSemanticHasher -- type objects → "type:." - register_builtin_python_type_semantic_hashers -- populate a registry with all of the above + PathHandler -- pathlib.Path → file-content hash + UUIDHandler -- uuid.UUID → canonical bytes + BytesHandler -- bytes/bytearray → hex string + FunctionHandler -- callable → via FunctionInfoExtractorProtocol + TypeObjectHandler -- type objects → "type:." + register_builtin_python_type_handlers -- populate a registry with all of the above -Function info extractors (used by FunctionSemanticHasher): +Function info extractors (used by FunctionHandler): FunctionNameExtractor FunctionSignatureExtractor FunctionInfoExtractorFactory """ from orcapod.hashing.semantic_hashing.builtin_handlers import ( - BytesSemanticHasher, - FunctionSemanticHasher, - PathSemanticHasher, - TypeObjectSemanticHasher, - UUIDSemanticHasher, - register_builtin_python_type_semantic_hashers, + BytesHandler, + FunctionHandler, + PathHandler, + TypeObjectHandler, + UUIDHandler, + register_builtin_python_type_handlers, ) from orcapod.hashing.semantic_hashing.content_identifiable_mixin import ( ContentIdentifiableMixin, @@ -38,21 +38,21 @@ ) from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher from orcapod.hashing.semantic_hashing.type_handler_registry import ( - BuiltinPythonTypeSemanticHasherRegistry, - PythonTypeSemanticHasherRegistry, + BuiltinPythonTypeHandlerRegistry, + PythonTypeHandlerRegistry, ) __all__ = [ "SemanticAwarePythonHasher", - "PythonTypeSemanticHasherRegistry", - "BuiltinPythonTypeSemanticHasherRegistry", + "PythonTypeHandlerRegistry", + "BuiltinPythonTypeHandlerRegistry", "ContentIdentifiableMixin", - "PathSemanticHasher", - "UUIDSemanticHasher", - "BytesSemanticHasher", - "FunctionSemanticHasher", - "TypeObjectSemanticHasher", - "register_builtin_python_type_semantic_hashers", + "PathHandler", + "UUIDHandler", + "BytesHandler", + "FunctionHandler", + "TypeObjectHandler", + "register_builtin_python_type_handlers", "FunctionNameExtractor", "FunctionSignatureExtractor", "FunctionInfoExtractorFactory", diff --git a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py index 371950db..469a1fe5 100644 --- a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py +++ b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py @@ -1,19 +1,19 @@ """ Built-in PythonTypeHandler implementations. - PathSemanticHasher -- pathlib.Path: file content hash - UPathSemanticHasher -- upath.UPath: file content hash (remote-aware) - UUIDSemanticHasher -- uuid.UUID: 16-byte binary representation - BytesSemanticHasher -- bytes/bytearray: hex string representation - FunctionSemanticHasher -- callable with __code__: via FunctionInfoExtractorProtocol - TypeObjectSemanticHasher -- type objects: stable "type:." string - SpecialFormSemanticHasher -- typing._SpecialForm - GenericAliasSemanticHasher -- generic alias type annotations - UnionTypeSemanticHasher -- types.UnionType (Python 3.10+ X | Y syntax) - ArrowTableSemanticHasher -- pa.Table / pa.RecordBatch - SchemaSemanticHasher -- Schema objects - -``register_builtin_python_type_semantic_hashers(registry)`` populates a registry + PathHandler -- pathlib.Path: file content hash + UPathHandler -- upath.UPath: file content hash (remote-aware) + UUIDHandler -- uuid.UUID: 16-byte binary representation + BytesHandler -- bytes/bytearray: hex string representation + FunctionHandler -- callable with __code__: via FunctionInfoExtractorProtocol + TypeObjectHandler -- type objects: stable "type:." string + SpecialFormHandler -- typing._SpecialForm + GenericAliasHandler -- generic alias type annotations + UnionTypeHandler -- types.UnionType (Python 3.10+ X | Y syntax) + ArrowTableHandler -- pa.Table / pa.RecordBatch + SchemaHandler -- Schema objects + +``register_builtin_python_type_handlers(registry)`` populates a registry with all of the above. """ @@ -30,7 +30,7 @@ if TYPE_CHECKING: from orcapod.hashing.semantic_hashing.type_handler_registry import ( - PythonTypeSemanticHasherRegistry, + PythonTypeHandlerRegistry, ) from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher from orcapod.protocols.hashing_protocols import ( @@ -41,7 +41,7 @@ logger = logging.getLogger(__name__) -class PathSemanticHasher: +class PathHandler: """Hasher for pathlib.Path objects — hashes file *content*. Args: @@ -55,19 +55,19 @@ def handle(self, obj: PathLike, hasher: "SemanticAwarePythonHasher") -> ContentH path: Path = Path(obj) if not path.exists(): raise FileNotFoundError( - f"PathSemanticHasher: path does not exist: {path!r}. " + f"PathHandler: path does not exist: {path!r}. " "Paths must refer to existing files for content-based hashing." ) if path.is_dir(): raise IsADirectoryError( - f"PathSemanticHasher: path is a directory: {path!r}. " + f"PathHandler: path is a directory: {path!r}. " "Only regular files are supported for content-based hashing." ) - logger.debug("PathSemanticHasher: hashing file content at %s", path) + logger.debug("PathHandler: hashing file content at %s", path) return self.file_hasher.hash_file(path) -class UPathSemanticHasher: +class UPathHandler: """Hasher for universal_pathlib.UPath objects — hashes file content. Args: @@ -80,39 +80,39 @@ def __init__(self, file_hasher: "FileContentHasherProtocol") -> None: def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: if not isinstance(obj, UPath): raise TypeError( - f"UPathSemanticHasher: expected a UPath, got {type(obj)!r}." + f"UPathHandler: expected a UPath, got {type(obj)!r}." ) if not obj.exists(): raise FileNotFoundError( - f"UPathSemanticHasher: path does not exist: {obj!r}." + f"UPathHandler: path does not exist: {obj!r}." ) if obj.is_dir(): raise IsADirectoryError( - f"UPathSemanticHasher: path is a directory: {obj!r}." + f"UPathHandler: path is a directory: {obj!r}." ) - logger.debug("UPathSemanticHasher: hashing file content at %s", obj) + logger.debug("UPathHandler: hashing file content at %s", obj) return self.file_hasher.hash_file(obj) -class UUIDSemanticHasher: +class UUIDHandler: """Hasher for ``uuid.UUID`` objects — returns the raw 16-byte binary representation.""" def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: return obj.bytes -class BytesSemanticHasher: +class BytesHandler: """Hasher for bytes and bytearray objects — returns the lowercase hex string.""" def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: if isinstance(obj, (bytes, bytearray)): return obj.hex() raise TypeError( - f"BytesSemanticHasher: expected bytes or bytearray, got {type(obj)!r}" + f"BytesHandler: expected bytes or bytearray, got {type(obj)!r}" ) -class FunctionSemanticHasher: +class FunctionHandler: """Hasher for Python functions/callables with a ``__code__`` attribute. Args: @@ -126,15 +126,15 @@ def __init__(self, function_info_extractor: Any) -> None: def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: if not (callable(obj) and hasattr(obj, "__code__")): raise TypeError( - f"FunctionSemanticHasher: expected a callable with __code__, got {type(obj)!r}" + f"FunctionHandler: expected a callable with __code__, got {type(obj)!r}" ) func_name = getattr(obj, "__name__", repr(obj)) - logger.debug("FunctionSemanticHasher: extracting info for function %r", func_name) + logger.debug("FunctionHandler: extracting info for function %r", func_name) info: dict[str, Any] = self.function_info_extractor.extract_function_info(obj) return info -class TypeObjectSemanticHasher: +class TypeObjectHandler: """Hasher for type objects (classes passed as values). Returns a stable string of the form ``"type:."``. @@ -143,14 +143,14 @@ class TypeObjectSemanticHasher: def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: if not isinstance(obj, type): raise TypeError( - f"TypeObjectSemanticHasher: expected a type/class, got {type(obj)!r}" + f"TypeObjectHandler: expected a type/class, got {type(obj)!r}" ) module: str = obj.__module__ or "" qualname: str = obj.__qualname__ return f"type:{module}.{qualname}" -class SpecialFormSemanticHasher: +class SpecialFormHandler: """Hasher for ``typing._SpecialForm`` objects such as ``typing.Union``.""" def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: @@ -158,7 +158,7 @@ def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: return f"special_form:typing.{name}" -class GenericAliasSemanticHasher: +class GenericAliasHandler: """Hasher for generic alias type annotations (``dict[int, str]``, ``Optional[X]``, etc.).""" def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: @@ -178,7 +178,7 @@ def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: } -class UnionTypeSemanticHasher: +class UnionTypeHandler: """Hasher for ``types.UnionType`` objects (Python 3.10+ ``X | Y`` syntax).""" def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: @@ -187,7 +187,7 @@ def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: return {"__type__": "union", "args": hashed_args} -class ArrowTableSemanticHasher: +class ArrowTableHandler: """Hasher for ``pa.Table`` and ``pa.RecordBatch`` objects. Args: @@ -213,24 +213,24 @@ def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: obj = _pa.Table.from_batches([obj]) if not isinstance(obj, _pa.Table): raise TypeError( - f"ArrowTableSemanticHasher: expected pa.Table or pa.RecordBatch, got {type(obj)!r}" + f"ArrowTableHandler: expected pa.Table or pa.RecordBatch, got {type(obj)!r}" ) return self._get_arrow_hasher().hash_table(obj) -class SchemaSemanticHasher: +class SchemaHandler: """Hasher for ``Schema`` objects.""" def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: if not isinstance(obj, Schema): raise TypeError( - f"SchemaSemanticHasher: expected a Schema, got {type(obj)!r}" + f"SchemaHandler: expected a Schema, got {type(obj)!r}" ) - raise NotImplementedError("SchemaSemanticHasher is not yet implemented.") + raise NotImplementedError("SchemaHandler is not yet implemented.") -def register_builtin_python_type_semantic_hashers( - registry: "PythonTypeSemanticHasherRegistry", +def register_builtin_python_type_handlers( + registry: "PythonTypeHandlerRegistry", file_hasher: Any = None, function_info_extractor: Any = None, arrow_hasher: "ArrowHasherProtocol | None" = None, @@ -238,13 +238,13 @@ def register_builtin_python_type_semantic_hashers( """Register all built-in semantic hashers into *registry*. ``pa.Table`` and ``pa.RecordBatch`` are always registered via - ``ArrowTableSemanticHasher``. When ``arrow_hasher`` is provided it is - passed through for immediate use; when ``None``, ``ArrowTableSemanticHasher`` + ``ArrowTableHandler``. When ``arrow_hasher`` is provided it is + passed through for immediate use; when ``None``, ``ArrowTableHandler`` resolves the active arrow hasher lazily via ``get_default_context()`` at hash time, breaking the construction-time circular dependency. Args: - registry: The ``PythonTypeSemanticHasherRegistry`` to populate. + registry: The ``PythonTypeHandlerRegistry`` to populate. file_hasher: Optional ``FileContentHasherProtocol`` for path hashing. Defaults to ``BasicFileHasher(sha256)``. function_info_extractor: Optional ``FunctionInfoExtractorProtocol``. @@ -265,41 +265,41 @@ def register_builtin_python_type_semantic_hashers( include_defaults=True, ) - bytes_hasher = BytesSemanticHasher() + bytes_hasher = BytesHandler() registry.register(bytes, bytes_hasher) registry.register(bytearray, bytes_hasher) - registry.register(Path, PathSemanticHasher(file_hasher)) - registry.register(UPath, UPathSemanticHasher(file_hasher)) - registry.register(UUID, UUIDSemanticHasher()) + registry.register(Path, PathHandler(file_hasher)) + registry.register(UPath, UPathHandler(file_hasher)) + registry.register(UUID, UUIDHandler()) import types as _types - function_hasher = FunctionSemanticHasher(function_info_extractor) + function_hasher = FunctionHandler(function_info_extractor) registry.register(_types.FunctionType, function_hasher) registry.register(_types.BuiltinFunctionType, function_hasher) registry.register(_types.MethodType, function_hasher) - registry.register(type, TypeObjectSemanticHasher()) - registry.register(_types.UnionType, UnionTypeSemanticHasher()) + registry.register(type, TypeObjectHandler()) + registry.register(_types.UnionType, UnionTypeHandler()) - generic_alias_hasher = GenericAliasSemanticHasher() + generic_alias_hasher = GenericAliasHandler() registry.register(_types.GenericAlias, generic_alias_hasher) try: import typing as _typing registry.register(_typing._GenericAlias, generic_alias_hasher) # type: ignore[attr-defined] - registry.register(_typing._SpecialForm, SpecialFormSemanticHasher()) # type: ignore[attr-defined] + registry.register(_typing._SpecialForm, SpecialFormHandler()) # type: ignore[attr-defined] except AttributeError: pass - registry.register(Schema, SchemaSemanticHasher()) + registry.register(Schema, SchemaHandler()) import pyarrow as _pa - arrow_table_hasher = ArrowTableSemanticHasher(arrow_hasher) + arrow_table_hasher = ArrowTableHandler(arrow_hasher) registry.register(_pa.Table, arrow_table_hasher) registry.register(_pa.RecordBatch, arrow_table_hasher) logger.debug( - "register_builtin_python_type_semantic_hashers: registered %d hashers", + "register_builtin_python_type_handlers: registered %d hashers", len(registry), ) diff --git a/src/orcapod/hashing/semantic_hashing/semantic_hasher.py b/src/orcapod/hashing/semantic_hashing/semantic_hasher.py index 24211741..a77b2750 100644 --- a/src/orcapod/hashing/semantic_hashing/semantic_hasher.py +++ b/src/orcapod/hashing/semantic_hashing/semantic_hasher.py @@ -71,7 +71,7 @@ from collections.abc import Callable, Mapping from typing import Any -from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeSemanticHasherRegistry +from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeHandlerRegistry from orcapod.protocols import hashing_protocols as hp from orcapod.types import ContentHash @@ -90,8 +90,8 @@ class SemanticAwarePythonHasher: hasher_id: A short string identifying this hasher version/configuration. Embedded in every ContentHash produced. - type_semantic_hasher_registry: - ``PythonTypeSemanticHasherRegistry`` for MRO-aware lookup of + type_handler_registry: + ``PythonTypeHandlerRegistry`` for MRO-aware lookup of ``PythonTypeHandler`` instances. If None, the default registry is used. strict: @@ -102,17 +102,17 @@ class SemanticAwarePythonHasher: def __init__( self, hasher_id: str, - type_semantic_hasher_registry: PythonTypeSemanticHasherRegistry | None = None, + type_handler_registry: PythonTypeHandlerRegistry | None = None, strict: bool = True, ) -> None: self._hasher_id = hasher_id self._strict = strict - if type_semantic_hasher_registry is None: - from orcapod.hashing.defaults import get_default_python_type_semantic_hasher_registry - self._registry = get_default_python_type_semantic_hasher_registry() + if type_handler_registry is None: + from orcapod.hashing.defaults import get_default_python_type_handler_registry + self._registry = get_default_python_type_handler_registry() else: - self._registry = type_semantic_hasher_registry + self._registry = type_handler_registry # ------------------------------------------------------------------ # Public API @@ -127,8 +127,8 @@ def strict(self) -> bool: return self._strict @property - def type_semantic_hasher_registry(self) -> PythonTypeSemanticHasherRegistry: - """Return the ``PythonTypeSemanticHasherRegistry`` used by this hasher.""" + def type_handler_registry(self) -> PythonTypeHandlerRegistry: + """Return the ``PythonTypeHandlerRegistry`` used by this hasher.""" return self._registry def hash_object( @@ -392,7 +392,7 @@ def _handle_unknown(self, obj: Any) -> str: f"SemanticAwarePythonHasher (strict): no PythonTypeHandler " f"registered for type '{qualified}' and it does not implement " "ContentIdentifiableProtocol. Register a PythonTypeHandler " - "via the PythonTypeSemanticHasherRegistry or implement " + "via the PythonTypeHandlerRegistry or implement " "identity_structure() on the class." ) diff --git a/src/orcapod/hashing/semantic_hashing/type_handler_registry.py b/src/orcapod/hashing/semantic_hashing/type_handler_registry.py index decfaf0c..1fcc46b9 100644 --- a/src/orcapod/hashing/semantic_hashing/type_handler_registry.py +++ b/src/orcapod/hashing/semantic_hashing/type_handler_registry.py @@ -1,5 +1,5 @@ """ -PythonTypeSemanticHasherRegistry — MRO-aware registry for PythonTypeHandler instances. +PythonTypeHandlerRegistry — MRO-aware registry for PythonTypeHandler instances. ``PythonTypeHandler`` is the protocol for type-specific handlers; this registry provides MRO-aware lookup so subclasses inherit their parent's handler. @@ -20,7 +20,7 @@ logger = logging.getLogger(__name__) -class PythonTypeSemanticHasherRegistry: +class PythonTypeHandlerRegistry: """Registry mapping Python types to PythonTypeHandler instances. Lookup is MRO-aware: when no hasher is registered for the exact type of @@ -68,7 +68,7 @@ def register(self, target_type: type, handler: "PythonTypeHandler") -> None: existing = self._handlers.get(target_type) if existing is not None and existing is not handler: logger.debug( - "PythonTypeSemanticHasherRegistry: replacing existing hasher for %s (%s -> %s)", + "PythonTypeHandlerRegistry: replacing existing hasher for %s (%s -> %s)", target_type.__name__, type(existing).__name__, type(handler).__name__, @@ -108,7 +108,7 @@ def get_semantic_hasher(self, obj: Any) -> "PythonTypeHandler | None": handler = self._handlers.get(base) if handler is not None: logger.debug( - "PythonTypeSemanticHasherRegistry: resolved hasher for %s via base %s", + "PythonTypeHandlerRegistry: resolved hasher for %s via base %s", obj_type.__name__, base.__name__, ) @@ -152,28 +152,28 @@ def registered_types(self) -> list[type]: def __repr__(self) -> str: with self._lock: names = [t.__name__ for t in self._handlers] - return f"PythonTypeSemanticHasherRegistry(registered={names!r})" + return f"PythonTypeHandlerRegistry(registered={names!r})" def __len__(self) -> int: with self._lock: return len(self._handlers) -def get_default_python_type_semantic_hasher_registry() -> "PythonTypeSemanticHasherRegistry": - """Return the PythonTypeSemanticHasherRegistry from the default data context. +def get_default_python_type_handler_registry() -> "PythonTypeHandlerRegistry": + """Return the PythonTypeHandlerRegistry from the default data context. This is a convenience wrapper; the registry is owned and versioned by the active ``DataContext``. Importing this function from ``orcapod.hashing.defaults`` or ``orcapod.hashing`` is equivalent. """ from orcapod.hashing.defaults import ( - get_default_python_type_semantic_hasher_registry as _get, + get_default_python_type_handler_registry as _get, ) return _get() -class BuiltinPythonTypeSemanticHasherRegistry(PythonTypeSemanticHasherRegistry): - """A PythonTypeSemanticHasherRegistry pre-populated with all built-in hashers. +class BuiltinPythonTypeHandlerRegistry(PythonTypeHandlerRegistry): + """A PythonTypeHandlerRegistry pre-populated with all built-in hashers. Constructed via the data context JSON spec so that the default registry is versioned alongside the rest of the context components. @@ -182,6 +182,6 @@ class BuiltinPythonTypeSemanticHasherRegistry(PythonTypeSemanticHasherRegistry): def __init__(self, arrow_hasher: "ArrowHasherProtocol | None" = None) -> None: super().__init__() from orcapod.hashing.semantic_hashing.builtin_handlers import ( - register_builtin_python_type_semantic_hashers, + register_builtin_python_type_handlers, ) - register_builtin_python_type_semantic_hashers(self, arrow_hasher=arrow_hasher) + register_builtin_python_type_handlers(self, arrow_hasher=arrow_hasher) diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index a7fed13f..428e065b 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -49,7 +49,7 @@ def get_versioned_semantic_hasher( hasher_id: str = _CURRENT_SEMANTIC_HASHER_ID, strict: bool = True, - type_semantic_hasher_registry: "Any | None" = None, + type_handler_registry: "Any | None" = None, ) -> hp.SemanticHasherProtocol: """Return a SemanticAwarePythonHasher configured for the current version. @@ -60,17 +60,17 @@ def get_versioned_semantic_hasher( strict: When True raises TypeError for unhandled types. When False falls back to a best-effort string representation. - type_semantic_hasher_registry: - Optional ``PythonTypeSemanticHasherRegistry`` to inject. When None the + type_handler_registry: + Optional ``PythonTypeHandlerRegistry`` to inject. When None the global default registry is used. """ from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher - if type_semantic_hasher_registry is None: + if type_handler_registry is None: from orcapod.hashing.semantic_hashing.type_handler_registry import ( - get_default_python_type_semantic_hasher_registry, + get_default_python_type_handler_registry, ) - type_semantic_hasher_registry = get_default_python_type_semantic_hasher_registry() + type_handler_registry = get_default_python_type_handler_registry() logger.debug( "get_versioned_semantic_hasher: creating SemanticAwarePythonHasher " @@ -80,7 +80,7 @@ def get_versioned_semantic_hasher( ) return SemanticAwarePythonHasher( hasher_id=hasher_id, - type_semantic_hasher_registry=type_semantic_hasher_registry, + type_handler_registry=type_handler_registry, strict=strict, ) diff --git a/src/orcapod/hashing/visitors.py b/src/orcapod/hashing/visitors.py index ee0da7d5..feeab8d6 100644 --- a/src/orcapod/hashing/visitors.py +++ b/src/orcapod/hashing/visitors.py @@ -210,7 +210,7 @@ def visit_extension( return extension_type, storage_value # Only hash if a semantic hasher is registered for this Python type. - if not self._python_hasher.type_semantic_hasher_registry.has_semantic_hasher( + if not self._python_hasher.type_handler_registry.has_semantic_hasher( python_type ): return extension_type, storage_value diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index d2a2f890..33469fc9 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: import pyarrow as pa - from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeSemanticHasherRegistry + from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeHandlerRegistry from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher @@ -55,7 +55,7 @@ class PythonTypeHandler(Protocol): A ``PythonTypeHandler`` converts a specific Python type into a representative Python structure that ``SemanticAwarePythonHasher.hash_object()`` can then hash. Implementations are registered with a - ``PythonTypeSemanticHasherRegistry`` and looked up via MRO-aware resolution. + ``PythonTypeHandlerRegistry`` and looked up via MRO-aware resolution. Each implementation receives the full ``SemanticAwarePythonHasher`` so it can delegate hashing of sub-values back to the outer hasher without coupling to a @@ -102,8 +102,8 @@ def hasher_id(self) -> str: ... @property - def type_semantic_hasher_registry(self) -> "PythonTypeSemanticHasherRegistry": - """Return the PythonTypeSemanticHasherRegistry used by this hasher.""" + def type_handler_registry(self) -> "PythonTypeHandlerRegistry": + """Return the PythonTypeHandlerRegistry used by this hasher.""" ... diff --git a/superpowers/plans/2026-06-24-rename-semantic-hasher-to-handler.md b/superpowers/plans/2026-06-24-rename-semantic-hasher-to-handler.md new file mode 100644 index 00000000..d33489a3 --- /dev/null +++ b/superpowers/plans/2026-06-24-rename-semantic-hasher-to-handler.md @@ -0,0 +1,422 @@ +# Rename *SemanticHasher → *Handler, PythonTypeSemanticHasherRegistry → PythonTypeHandlerRegistry + +> **For agentic workers:** REQUIRED SUB-SKILL: Use sensei:subagent-driven-development (recommended) or sensei:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Mechanically rename all `*SemanticHasher` handler classes to `*Handler`, all `PythonTypeSemanticHasherRegistry` variants to `PythonTypeHandlerRegistry`, and the `type_semantic_hasher_registry` param/property to `type_handler_registry` — no logic changes. + +**Architecture:** Pure find-and-replace of identifiers across ~10 source files and 2 JSON configs. Every old name maps 1-to-1 to a new name. No logic, no interface changes, no backward-compat shims (greenfield project). + +**Tech Stack:** Python, JSON, uv/pytest + +--- + +## File Map + +| File | What changes | +|---|---| +| `src/orcapod/hashing/semantic_hashing/builtin_handlers.py` | 11 class names + function name + docstring/string literals | +| `src/orcapod/hashing/semantic_hashing/type_handler_registry.py` | 3 class/function names + docstrings + internal log strings | +| `src/orcapod/hashing/semantic_hashing/semantic_hasher.py` | param + property name `type_semantic_hasher_registry` → `type_handler_registry` + docstring | +| `src/orcapod/hashing/semantic_hashing/__init__.py` | imports + `__all__` | +| `src/orcapod/hashing/__init__.py` | imports + `__all__` | +| `src/orcapod/hashing/defaults.py` | function name + import + docstring | +| `src/orcapod/hashing/versioned_hashers.py` | param name + import | +| `src/orcapod/protocols/hashing_protocols.py` | property name in `SemanticHasherProtocol` + TYPE_CHECKING import | +| `src/orcapod/contexts/data/v0.1.json` | top-level key, `_class` values, `_ref` value, sub-key | +| `src/orcapod/contexts/data/schemas/context_schema.json` | property key | +| `tests/test_hashing/test_semantic_hasher.py` | imports + usage | +| `tests/test_hashing/test_uuid_handler.py` | imports + usage | +| `tests/test_hashing/test_extension_type_hashing.py` | no old names (already clean) | +| `test-objective/unit/test_hashing.py` | imports, class names, type annotations, comments | + +--- + +## Rename Reference Table + +### Handler classes (builtin_handlers.py + all callers) + +| Old | New | +|---|---| +| `PathSemanticHasher` | `PathHandler` | +| `UPathSemanticHasher` | `UPathHandler` | +| `UUIDSemanticHasher` | `UUIDHandler` | +| `BytesSemanticHasher` | `BytesHandler` | +| `FunctionSemanticHasher` | `FunctionHandler` | +| `TypeObjectSemanticHasher` | `TypeObjectHandler` | +| `SpecialFormSemanticHasher` | `SpecialFormHandler` | +| `GenericAliasSemanticHasher` | `GenericAliasHandler` | +| `UnionTypeSemanticHasher` | `UnionTypeHandler` | +| `ArrowTableSemanticHasher` | `ArrowTableHandler` | +| `SchemaSemanticHasher` | `SchemaHandler` | +| `register_builtin_python_type_semantic_hashers` | `register_builtin_python_type_handlers` | + +### Registry classes (type_handler_registry.py + all callers) + +| Old | New | +|---|---| +| `PythonTypeSemanticHasherRegistry` | `PythonTypeHandlerRegistry` | +| `BuiltinPythonTypeSemanticHasherRegistry` | `BuiltinPythonTypeHandlerRegistry` | +| `get_default_python_type_semantic_hasher_registry` | `get_default_python_type_handler_registry` | + +### Parameter/property (semantic_hasher.py + all callers) + +| Old | New | +|---|---| +| `type_semantic_hasher_registry` | `type_handler_registry` | + +--- + +## Task 1: Rename class definitions and internal strings in `builtin_handlers.py` + +**Files:** +- Modify: `src/orcapod/hashing/semantic_hashing/builtin_handlers.py` + +- [ ] **Step 1: Apply all renames in builtin_handlers.py** + + Changes needed (all are identifier or string-literal renames only): + - Module docstring: update all `*SemanticHasher` names and `register_builtin_python_type_semantic_hashers` + - TYPE_CHECKING import: `PythonTypeSemanticHasherRegistry` → `PythonTypeHandlerRegistry` + - All 11 class definitions: `class PathSemanticHasher` → `class PathHandler`, etc. + - Error messages inside class bodies: e.g. `"PathSemanticHasher: path does not exist"` → `"PathHandler: path does not exist"` + - `logger.debug` strings: e.g. `"PathSemanticHasher: hashing file content"` → `"PathHandler: hashing file content"` + - Function `register_builtin_python_type_semantic_hashers` → `register_builtin_python_type_handlers` + - Docstring inside that function: update `PythonTypeSemanticHasherRegistry` → `PythonTypeHandlerRegistry` + - Final `logger.debug` string: `"register_builtin_python_type_semantic_hashers: registered %d hashers"` → `"register_builtin_python_type_handlers: registered %d hashers"` + +- [ ] **Step 2: Verify file parses correctly** + + ```bash + cd /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python + uv run python -c "from orcapod.hashing.semantic_hashing import builtin_handlers; print('OK')" + ``` + Expected: `OK` + +--- + +## Task 2: Rename class definitions in `type_handler_registry.py` + +**Files:** +- Modify: `src/orcapod/hashing/semantic_hashing/type_handler_registry.py` + +- [ ] **Step 1: Apply all renames in type_handler_registry.py** + + Changes needed: + - Module docstring: `PythonTypeSemanticHasherRegistry` → `PythonTypeHandlerRegistry` + - Class `PythonTypeSemanticHasherRegistry` → `PythonTypeHandlerRegistry` + - `__repr__` method: `"PythonTypeSemanticHasherRegistry(registered=..."` → `"PythonTypeHandlerRegistry(registered=..."` + - `logger.debug` strings that mention `PythonTypeSemanticHasherRegistry` + - Function `get_default_python_type_semantic_hasher_registry` → `get_default_python_type_handler_registry` + - The function body's import: `get_default_python_type_semantic_hasher_registry as _get` → `get_default_python_type_handler_registry as _get` + - Class `BuiltinPythonTypeSemanticHasherRegistry` → `BuiltinPythonTypeHandlerRegistry` + - Docstring: `"A PythonTypeSemanticHasherRegistry pre-populated..."` → `"A PythonTypeHandlerRegistry pre-populated..."` + - `super().__init__()` call — no change needed + - Import inside `__init__`: `register_builtin_python_type_semantic_hashers` → `register_builtin_python_type_handlers` + - Call: `register_builtin_python_type_semantic_hashers(self, ...)` → `register_builtin_python_type_handlers(self, ...)` + +- [ ] **Step 2: Verify file parses correctly** + + ```bash + cd /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python + uv run python -c "from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeHandlerRegistry; print('OK')" + ``` + Expected: `OK` + +--- + +## Task 3: Rename param/property in `semantic_hasher.py` + +**Files:** +- Modify: `src/orcapod/hashing/semantic_hashing/semantic_hasher.py` + +- [ ] **Step 1: Apply renames in semantic_hasher.py** + + Changes needed: + - Import: `PythonTypeSemanticHasherRegistry` → `PythonTypeHandlerRegistry` + - Docstring parameter: `type_semantic_hasher_registry:` → `type_handler_registry:` + - Constructor param: `type_semantic_hasher_registry: PythonTypeHandlerRegistry | None = None` → `type_handler_registry: PythonTypeHandlerRegistry | None = None` + - Constructor body: `if type_semantic_hasher_registry is None:` → `if type_handler_registry is None:` + - Constructor body: `from orcapod.hashing.defaults import get_default_python_type_semantic_hasher_registry` → `get_default_python_type_handler_registry` + - Constructor body: `self._registry = get_default_python_type_semantic_hasher_registry()` → `get_default_python_type_handler_registry()` + - Constructor body: `else: self._registry = type_semantic_hasher_registry` → `else: self._registry = type_handler_registry` + - Property `type_semantic_hasher_registry` → `type_handler_registry` + - Property docstring: `"Return the ``PythonTypeSemanticHasherRegistry``..."` → `"Return the ``PythonTypeHandlerRegistry``..."` + - Property return type annotation: `PythonTypeSemanticHasherRegistry` → `PythonTypeHandlerRegistry` + - Error message in `_handle_unknown`: `"via the PythonTypeSemanticHasherRegistry or"` → `"via the PythonTypeHandlerRegistry or"` + +- [ ] **Step 2: Verify file parses correctly** + + ```bash + cd /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python + uv run python -c "from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher; print('OK')" + ``` + Expected: `OK` + +--- + +## Task 4: Update `semantic_hashing/__init__.py` + +**Files:** +- Modify: `src/orcapod/hashing/semantic_hashing/__init__.py` + +- [ ] **Step 1: Apply renames** + + Changes needed: + - Module docstring: all `*SemanticHasher` names → `*Handler` equivalents + - Import from `builtin_handlers`: `BytesSemanticHasher` → `BytesHandler`, etc.; `register_builtin_python_type_semantic_hashers` → `register_builtin_python_type_handlers` + - Import from `type_handler_registry`: `BuiltinPythonTypeSemanticHasherRegistry` → `BuiltinPythonTypeHandlerRegistry`, `PythonTypeSemanticHasherRegistry` → `PythonTypeHandlerRegistry` + - `__all__`: update all entries to new names + +- [ ] **Step 2: Verify** + + ```bash + cd /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python + uv run python -c "from orcapod.hashing.semantic_hashing import PathHandler, PythonTypeHandlerRegistry, register_builtin_python_type_handlers; print('OK')" + ``` + Expected: `OK` + +--- + +## Task 5: Update `hashing/__init__.py` + +**Files:** +- Modify: `src/orcapod/hashing/__init__.py` + +- [ ] **Step 1: Apply renames** + + Changes needed: + - Module docstring: update all old names + - Import from `defaults`: `get_default_python_type_semantic_hasher_registry` → `get_default_python_type_handler_registry` + - Import from `builtin_handlers`: `BytesSemanticHasher` → `BytesHandler`, etc.; `register_builtin_python_type_semantic_hashers` → `register_builtin_python_type_handlers` + - Import from `type_handler_registry`: `BuiltinPythonTypeSemanticHasherRegistry` → `BuiltinPythonTypeHandlerRegistry`, `PythonTypeSemanticHasherRegistry` → `PythonTypeHandlerRegistry` + - `__all__`: update all entries to new names + +- [ ] **Step 2: Verify** + + ```bash + cd /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python + uv run python -c "from orcapod.hashing import PythonTypeHandlerRegistry, get_default_python_type_handler_registry, BytesHandler; print('OK')" + ``` + Expected: `OK` + +--- + +## Task 6: Update `hashing/defaults.py` + +**Files:** +- Modify: `src/orcapod/hashing/defaults.py` + +- [ ] **Step 1: Apply renames** + + Changes needed: + - Import: `PythonTypeSemanticHasherRegistry` → `PythonTypeHandlerRegistry` + - Function name: `get_default_python_type_semantic_hasher_registry` → `get_default_python_type_handler_registry` + - Return type annotation: `PythonTypeSemanticHasherRegistry` → `PythonTypeHandlerRegistry` + - Docstring: update class name references + - Function body: `get_default_context().semantic_hasher.type_semantic_hasher_registry` → `get_default_context().semantic_hasher.type_handler_registry` + +- [ ] **Step 2: Verify** + + ```bash + cd /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python + uv run python -c "from orcapod.hashing.defaults import get_default_python_type_handler_registry; print('OK')" + ``` + Expected: `OK` + +--- + +## Task 7: Update `hashing/versioned_hashers.py` + +**Files:** +- Modify: `src/orcapod/hashing/versioned_hashers.py` + +- [ ] **Step 1: Apply renames** + + Changes needed: + - Function param: `type_semantic_hasher_registry: "Any | None" = None` → `type_handler_registry: "Any | None" = None` + - Docstring param description: `type_semantic_hasher_registry:` → `type_handler_registry:` + - Import inside function: `get_default_python_type_semantic_hasher_registry` → `get_default_python_type_handler_registry` + - Variable: `type_semantic_hasher_registry = get_default_python_type_semantic_hasher_registry()` → `type_handler_registry = get_default_python_type_handler_registry()` + - `SemanticAwarePythonHasher(... type_semantic_hasher_registry=type_semantic_hasher_registry ...)` → `... type_handler_registry=type_handler_registry ...` + +- [ ] **Step 2: Verify** + + ```bash + cd /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python + uv run python -c "from orcapod.hashing.versioned_hashers import get_versioned_semantic_hasher; print('OK')" + ``` + Expected: `OK` + +--- + +## Task 8: Update `protocols/hashing_protocols.py` + +**Files:** +- Modify: `src/orcapod/protocols/hashing_protocols.py` + +- [ ] **Step 1: Apply renames** + + Changes needed: + - TYPE_CHECKING import: `PythonTypeSemanticHasherRegistry` → `PythonTypeHandlerRegistry` + - `SemanticHasherProtocol.type_semantic_hasher_registry` property → `type_handler_registry` + - Property docstring: `"Return the PythonTypeSemanticHasherRegistry..."` → `"Return the PythonTypeHandlerRegistry..."` + - Property return type annotation: `"PythonTypeSemanticHasherRegistry"` → `"PythonTypeHandlerRegistry"` + +- [ ] **Step 2: Verify** + + ```bash + cd /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python + uv run python -c "from orcapod.protocols.hashing_protocols import SemanticHasherProtocol; print('OK')" + ``` + Expected: `OK` + +--- + +## Task 9: Update `contexts/data/v0.1.json` + +**Files:** +- Modify: `src/orcapod/contexts/data/v0.1.json` + +- [ ] **Step 1: Apply renames** + + Changes needed (4 renames): + 1. Top-level key `"python_type_semantic_hasher_registry"` → `"python_type_handler_registry"` + 2. All `"_class"` values with `*SemanticHasher` suffix — e.g.: + - `"...builtin_handlers.BytesSemanticHasher"` → `"...builtin_handlers.BytesHandler"` + - `"...builtin_handlers.PathSemanticHasher"` → `"...builtin_handlers.PathHandler"` + - `"...builtin_handlers.UPathSemanticHasher"` → `"...builtin_handlers.UPathHandler"` + - `"...builtin_handlers.UUIDSemanticHasher"` → `"...builtin_handlers.UUIDHandler"` + - `"...builtin_handlers.FunctionSemanticHasher"` → `"...builtin_handlers.FunctionHandler"` + - `"...builtin_handlers.TypeObjectSemanticHasher"` → `"...builtin_handlers.TypeObjectHandler"` + - `"...builtin_handlers.GenericAliasSemanticHasher"` → `"...builtin_handlers.GenericAliasHandler"` + - `"...builtin_handlers.UnionTypeSemanticHasher"` → `"...builtin_handlers.UnionTypeHandler"` + - `"...builtin_handlers.SpecialFormSemanticHasher"` → `"...builtin_handlers.SpecialFormHandler"` + - `"...builtin_handlers.ArrowTableSemanticHasher"` → `"...builtin_handlers.ArrowTableHandler"` + - `"...type_handler_registry.PythonTypeSemanticHasherRegistry"` → `"...type_handler_registry.PythonTypeHandlerRegistry"` + 3. Inside `semantic_hasher._config`: sub-key `"type_semantic_hasher_registry"` → `"type_handler_registry"` + 4. Inside `semantic_hasher._config.type_handler_registry`: `"_ref": "python_type_semantic_hasher_registry"` → `"_ref": "python_type_handler_registry"` + +- [ ] **Step 2: Verify JSON is valid and context loads** + + ```bash + cd /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python + uv run python -c "import json; json.load(open('src/orcapod/contexts/data/v0.1.json')); print('JSON OK')" + uv run python -c "from orcapod.contexts import get_default_context; ctx = get_default_context(); print('Context OK')" + ``` + Expected: `JSON OK` then `Context OK` + +--- + +## Task 10: Update `contexts/data/schemas/context_schema.json` + +**Files:** +- Modify: `src/orcapod/contexts/data/schemas/context_schema.json` + +- [ ] **Step 1: Apply renames** + + Changes needed: + - Property key `"python_type_semantic_hasher_registry"` → `"python_type_handler_registry"` (in `properties` section) + - Description string within that property: `"ObjectSpec for the PythonTypeSemanticHasherRegistry..."` → `"ObjectSpec for the PythonTypeHandlerRegistry..."` + - In the `examples` section: `"type_semantic_hasher_registry"` sub-key → `"type_handler_registry"`, and `"_ref": "python_type_semantic_hasher_registry"` → `"_ref": "python_type_handler_registry"` + +- [ ] **Step 2: Verify JSON is valid** + + ```bash + cd /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python + uv run python -c "import json; json.load(open('src/orcapod/contexts/data/schemas/context_schema.json')); print('Schema JSON OK')" + ``` + Expected: `Schema JSON OK` + +--- + +## Task 11: Update test files + +**Files:** +- Modify: `tests/test_hashing/test_semantic_hasher.py` +- Modify: `tests/test_hashing/test_uuid_handler.py` +- Modify: `test-objective/unit/test_hashing.py` + +- [ ] **Step 1: Update `tests/test_hashing/test_semantic_hasher.py`** + + Changes needed: + - Import: `register_builtin_python_type_semantic_hashers` → `register_builtin_python_type_handlers` + - Import: `PythonTypeSemanticHasherRegistry` → `PythonTypeHandlerRegistry` + - Import: `get_default_python_type_semantic_hasher_registry` → `get_default_python_type_handler_registry` + - `make_hasher` body: `registry = PythonTypeSemanticHasherRegistry()` → `PythonTypeHandlerRegistry()`, `register_builtin_python_type_semantic_hashers(registry)` → `register_builtin_python_type_handlers(registry)`, `type_semantic_hasher_registry=registry` → `type_handler_registry=registry` + - All other usages of these names throughout the file (type annotations, variable names, docstrings, comments) + +- [ ] **Step 2: Update `tests/test_hashing/test_uuid_handler.py`** + + Changes needed: + - Import: `register_builtin_python_type_semantic_hashers` → `register_builtin_python_type_handlers` + - Import: `PythonTypeSemanticHasherRegistry` → `PythonTypeHandlerRegistry` + - `_make_hasher` body: same pattern as above + - `type_semantic_hasher_registry=registry` → `type_handler_registry=registry` + +- [ ] **Step 3: Update `test-objective/unit/test_hashing.py`** + + Changes needed (this file has many occurrences — all follow the same pattern): + - Imports: `PythonTypeSemanticHasherRegistry` → `PythonTypeHandlerRegistry`, `BuiltinPythonTypeSemanticHasherRegistry` → `BuiltinPythonTypeHandlerRegistry` + - All fixture/function type annotations: `PythonTypeSemanticHasherRegistry` → `PythonTypeHandlerRegistry` + - All constructor calls: `type_semantic_hasher_registry=registry` → `type_handler_registry=registry` + - All class names in test bodies: `PythonTypeSemanticHasherRegistry()` → `PythonTypeHandlerRegistry()` + - All `BuiltinPythonTypeSemanticHasherRegistry()` → `BuiltinPythonTypeHandlerRegistry()` + - All comments/docstrings mentioning old names + +- [ ] **Step 4: Verify test files parse** + + ```bash + cd /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python + uv run python -m py_compile tests/test_hashing/test_semantic_hasher.py && echo "OK" + uv run python -m py_compile tests/test_hashing/test_uuid_handler.py && echo "OK" + uv run python -m py_compile test-objective/unit/test_hashing.py && echo "OK" + ``` + Expected: three `OK` lines + +--- + +## Task 12: Run tests and commit + +- [ ] **Step 1: Run hashing tests** + + ```bash + cd /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python + uv run pytest tests/test_hashing/ -x -q + ``` + Expected: all tests pass + +- [ ] **Step 2: Run full test suite (excluding deleted semantic types)** + + ```bash + cd /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python + uv run pytest tests/ -x -q --ignore=tests/test_semantic_types + ``` + Expected: all tests pass + +- [ ] **Step 3: Confirm no remaining old names in source** + + ```bash + cd /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python + grep -rn "PathSemanticHasher\|UPathSemanticHasher\|UUIDSemanticHasher\|BytesSemanticHasher\|FunctionSemanticHasher\|TypeObjectSemanticHasher\|SpecialFormSemanticHasher\|GenericAliasSemanticHasher\|UnionTypeSemanticHasher\|ArrowTableSemanticHasher\|SchemaSemanticHasher\|PythonTypeSemanticHasherRegistry\|BuiltinPythonTypeSemanticHasherRegistry\|get_default_python_type_semantic_hasher_registry\|register_builtin_python_type_semantic_hashers\|type_semantic_hasher_registry" src/ tests/ test-objective/ --include="*.py" --include="*.json" | grep -v "^Binary" + ``` + Expected: no matches (zero lines) + +- [ ] **Step 4: Commit** + + ```bash + cd /home/kurouto/kurouto-jobs/dc15d84f-7281-48b5-9e17-435e9a04f175/orcapod-python + git add src/orcapod/hashing/semantic_hashing/builtin_handlers.py + git add src/orcapod/hashing/semantic_hashing/type_handler_registry.py + git add src/orcapod/hashing/semantic_hashing/semantic_hasher.py + git add src/orcapod/hashing/semantic_hashing/__init__.py + git add src/orcapod/hashing/__init__.py + git add src/orcapod/hashing/defaults.py + git add src/orcapod/hashing/versioned_hashers.py + git add src/orcapod/protocols/hashing_protocols.py + git add src/orcapod/contexts/data/v0.1.json + git add src/orcapod/contexts/data/schemas/context_schema.json + git add tests/test_hashing/test_semantic_hasher.py + git add tests/test_hashing/test_uuid_handler.py + git add test-objective/unit/test_hashing.py + git add superpowers/plans/2026-06-24-rename-semantic-hasher-to-handler.md + git commit -m "refactor(hashing): rename *SemanticHasher → *Handler, PythonTypeSemanticHasherRegistry → PythonTypeHandlerRegistry" + ``` diff --git a/test-objective/unit/test_hashing.py b/test-objective/unit/test_hashing.py index 695c01cd..0ef408f4 100644 --- a/test-objective/unit/test_hashing.py +++ b/test-objective/unit/test_hashing.py @@ -1,4 +1,4 @@ -"""Tests for SemanticAwarePythonHasher and PythonTypeSemanticHasherRegistry. +"""Tests for SemanticAwarePythonHasher and PythonTypeHandlerRegistry. Specification-derived tests covering deterministic hashing of primitives, structures, ContentHash pass-through, identity_structure resolution, @@ -15,8 +15,8 @@ from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher from orcapod.hashing.semantic_hashing.type_handler_registry import ( - BuiltinPythonTypeSemanticHasherRegistry, - PythonTypeSemanticHasherRegistry, + BuiltinPythonTypeHandlerRegistry, + PythonTypeHandlerRegistry, ) from orcapod.types import ContentHash @@ -27,27 +27,27 @@ @pytest.fixture -def registry() -> PythonTypeSemanticHasherRegistry: - """An empty PythonTypeSemanticHasherRegistry.""" - return PythonTypeSemanticHasherRegistry() +def registry() -> PythonTypeHandlerRegistry: + """An empty PythonTypeHandlerRegistry.""" + return PythonTypeHandlerRegistry() @pytest.fixture -def hasher(registry: PythonTypeSemanticHasherRegistry) -> SemanticAwarePythonHasher: +def hasher(registry: PythonTypeHandlerRegistry) -> SemanticAwarePythonHasher: """A strict SemanticAwarePythonHasher backed by an empty registry.""" return SemanticAwarePythonHasher( hasher_id="test_v1", - type_semantic_hasher_registry=registry, + type_handler_registry=registry, strict=True, ) @pytest.fixture -def lenient_hasher(registry: PythonTypeSemanticHasherRegistry) -> SemanticAwarePythonHasher: +def lenient_hasher(registry: PythonTypeHandlerRegistry) -> SemanticAwarePythonHasher: """A non-strict SemanticAwarePythonHasher backed by an empty registry.""" return SemanticAwarePythonHasher( hasher_id="test_v1", - type_semantic_hasher_registry=registry, + type_handler_registry=registry, strict=False, ) @@ -80,7 +80,7 @@ def content_hash(self, hasher: Any = None) -> ContentHash: if hasher is not None: return hasher.hash_object(self.identity_structure()) h = SemanticAwarePythonHasher( - "test_v1", type_semantic_hasher_registry=PythonTypeSemanticHasherRegistry(), strict=False + "test_v1", type_handler_registry=PythonTypeHandlerRegistry(), strict=False ) return h.hash_object(self.identity_structure()) @@ -271,34 +271,34 @@ def test_true_vs_one(self, hasher: SemanticAwarePythonHasher) -> None: # =================================================================== -# PythonTypeSemanticHasherRegistry -- register/get_semantic_hasher roundtrip +# PythonTypeHandlerRegistry -- register/get_semantic_hasher roundtrip # =================================================================== -class TestPythonTypeSemanticHasherRegistryBasics: +class TestPythonTypeHandlerRegistryBasics: """register() + get_semantic_hasher() roundtrip.""" - def test_register_and_get_semantic_hasher(self, registry: PythonTypeSemanticHasherRegistry) -> None: + def test_register_and_get_semantic_hasher(self, registry: PythonTypeHandlerRegistry) -> None: handler = _FakeHandler() registry.register(int, handler) assert registry.get_semantic_hasher(42) is handler def test_get_semantic_hasher_returns_none_for_unregistered( - self, registry: PythonTypeSemanticHasherRegistry + self, registry: PythonTypeHandlerRegistry ) -> None: assert registry.get_semantic_hasher("hello") is None # =================================================================== -# PythonTypeSemanticHasherRegistry -- MRO-aware lookup +# PythonTypeHandlerRegistry -- MRO-aware lookup # =================================================================== -class TestPythonTypeSemanticHasherRegistryMRO: +class TestPythonTypeHandlerRegistryMRO: """MRO-aware lookup: handler for parent class matches subclass.""" def test_subclass_inherits_parent_handler( - self, registry: PythonTypeSemanticHasherRegistry + self, registry: PythonTypeHandlerRegistry ) -> None: class Base: pass @@ -311,7 +311,7 @@ class Child(Base): assert registry.get_semantic_hasher(Child()) is handler def test_specific_handler_overrides_parent( - self, registry: PythonTypeSemanticHasherRegistry + self, registry: PythonTypeHandlerRegistry ) -> None: class Base: pass @@ -328,41 +328,41 @@ class Child(Base): # =================================================================== -# PythonTypeSemanticHasherRegistry -- unregister +# PythonTypeHandlerRegistry -- unregister # =================================================================== -class TestPythonTypeSemanticHasherRegistryUnregister: +class TestPythonTypeHandlerRegistryUnregister: """unregister() removes handler.""" - def test_unregister_existing(self, registry: PythonTypeSemanticHasherRegistry) -> None: + def test_unregister_existing(self, registry: PythonTypeHandlerRegistry) -> None: handler = _FakeHandler() registry.register(int, handler) result = registry.unregister(int) assert result is True assert registry.get_semantic_hasher(42) is None - def test_unregister_nonexistent(self, registry: PythonTypeSemanticHasherRegistry) -> None: + def test_unregister_nonexistent(self, registry: PythonTypeHandlerRegistry) -> None: result = registry.unregister(float) assert result is False # =================================================================== -# PythonTypeSemanticHasherRegistry -- has_semantic_hasher +# PythonTypeHandlerRegistry -- has_semantic_hasher # =================================================================== -class TestPythonTypeSemanticHasherRegistryHasSemanticHasher: +class TestPythonTypeHandlerRegistryHasSemanticHasher: """has_semantic_hasher() boolean check.""" - def test_has_semantic_hasher_true(self, registry: PythonTypeSemanticHasherRegistry) -> None: + def test_has_semantic_hasher_true(self, registry: PythonTypeHandlerRegistry) -> None: registry.register(int, _FakeHandler()) assert registry.has_semantic_hasher(int) is True - def test_has_semantic_hasher_false(self, registry: PythonTypeSemanticHasherRegistry) -> None: + def test_has_semantic_hasher_false(self, registry: PythonTypeHandlerRegistry) -> None: assert registry.has_semantic_hasher(str) is False - def test_has_semantic_hasher_via_mro(self, registry: PythonTypeSemanticHasherRegistry) -> None: + def test_has_semantic_hasher_via_mro(self, registry: PythonTypeHandlerRegistry) -> None: class Base: pass @@ -374,17 +374,17 @@ class Child(Base): # =================================================================== -# PythonTypeSemanticHasherRegistry -- registered_types +# PythonTypeHandlerRegistry -- registered_types # =================================================================== -class TestPythonTypeSemanticHasherRegistryRegisteredTypes: +class TestPythonTypeHandlerRegistryRegisteredTypes: """registered_types() lists types.""" - def test_registered_types_empty(self, registry: PythonTypeSemanticHasherRegistry) -> None: + def test_registered_types_empty(self, registry: PythonTypeHandlerRegistry) -> None: assert registry.registered_types() == [] - def test_registered_types_populated(self, registry: PythonTypeSemanticHasherRegistry) -> None: + def test_registered_types_populated(self, registry: PythonTypeHandlerRegistry) -> None: registry.register(int, _FakeHandler()) registry.register(str, _FakeHandler()) types = registry.registered_types() @@ -392,14 +392,14 @@ def test_registered_types_populated(self, registry: PythonTypeSemanticHasherRegi # =================================================================== -# PythonTypeSemanticHasherRegistry -- thread safety +# PythonTypeHandlerRegistry -- thread safety # =================================================================== -class TestPythonTypeSemanticHasherRegistryThreadSafety: +class TestPythonTypeHandlerRegistryThreadSafety: """Concurrent register/lookup doesn't crash.""" - def test_concurrent_register_lookup(self, registry: PythonTypeSemanticHasherRegistry) -> None: + def test_concurrent_register_lookup(self, registry: PythonTypeHandlerRegistry) -> None: errors: list[Exception] = [] def register_types(start: int, count: int) -> None: @@ -435,13 +435,13 @@ def lookup_types() -> None: # =================================================================== -# BuiltinPythonTypeSemanticHasherRegistry +# BuiltinPythonTypeHandlerRegistry # =================================================================== -class TestBuiltinPythonTypeSemanticHasherRegistry: - """BuiltinPythonTypeSemanticHasherRegistry is pre-populated with built-in handlers.""" +class TestBuiltinPythonTypeHandlerRegistry: + """BuiltinPythonTypeHandlerRegistry is pre-populated with built-in handlers.""" def test_construction(self) -> None: - reg = BuiltinPythonTypeSemanticHasherRegistry() + reg = BuiltinPythonTypeHandlerRegistry() assert len(reg.registered_types()) > 0 diff --git a/tests/test_hashing/test_extension_type_hashing.py b/tests/test_hashing/test_extension_type_hashing.py index 56a8d822..4cace31f 100644 --- a/tests/test_hashing/test_extension_type_hashing.py +++ b/tests/test_hashing/test_extension_type_hashing.py @@ -125,14 +125,14 @@ def test_null_value_passthrough(self, ctx): def test_unregistered_python_type_passes_through(self, ctx): """Extension types with no registered semantic hasher pass through unchanged.""" import uuid - from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeSemanticHasherRegistry + from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeHandlerRegistry from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher # Build a hasher with a registry that has NO entry for UUID - empty_registry = PythonTypeSemanticHasherRegistry() + empty_registry = PythonTypeHandlerRegistry() stripped_hasher = SemanticAwarePythonHasher( hasher_id="test_v0", - type_semantic_hasher_registry=empty_registry, + type_handler_registry=empty_registry, ) arrow_type = ctx.type_converter.register_python_class(uuid.UUID) diff --git a/tests/test_hashing/test_semantic_hasher.py b/tests/test_hashing/test_semantic_hasher.py index 5b3b04a2..bb1f8c12 100644 --- a/tests/test_hashing/test_semantic_hasher.py +++ b/tests/test_hashing/test_semantic_hasher.py @@ -5,13 +5,13 @@ - SemanticAwarePythonHasher: primitives, container type-tagging, determinism, circular references, strict vs non-strict mode - ContentIdentifiableProtocol protocol: independent hashing, composability - - PythonTypeSemanticHasherRegistry: registration, MRO-aware lookup, unregister + - PythonTypeHandlerRegistry: registration, MRO-aware lookup, unregister - Built-in hashers: bytes, UUID, Path, functions, type objects - ContentHash as terminal: returned as-is without re-hashing - ContentIdentifiableMixin: content_hash, __eq__, __hash__, caching, cache invalidation, injectable hasher - Custom type hasher registration and extension - - get_default_semantic_hasher / get_default_python_type_semantic_hasher_registry + - get_default_semantic_hasher / get_default_python_type_handler_registry """ from __future__ import annotations @@ -28,7 +28,7 @@ from orcapod.hashing.defaults import get_default_semantic_hasher from orcapod.hashing.semantic_hashing.builtin_handlers import ( - register_builtin_python_type_semantic_hashers, + register_builtin_python_type_handlers, ) from orcapod.hashing.semantic_hashing.content_identifiable_mixin import ( ContentIdentifiableMixin, @@ -38,8 +38,8 @@ _is_namedtuple, ) from orcapod.hashing.semantic_hashing.type_handler_registry import ( - PythonTypeSemanticHasherRegistry, - get_default_python_type_semantic_hasher_registry, + PythonTypeHandlerRegistry, + get_default_python_type_handler_registry, ) from orcapod.types import ContentHash @@ -50,10 +50,10 @@ def make_hasher(strict: bool = True) -> SemanticAwarePythonHasher: """Create a fresh SemanticAwarePythonHasher with an isolated registry.""" - registry = PythonTypeSemanticHasherRegistry() - register_builtin_python_type_semantic_hashers(registry) + registry = PythonTypeHandlerRegistry() + register_builtin_python_type_handlers(registry) return SemanticAwarePythonHasher( - hasher_id="test_v1", type_semantic_hasher_registry=registry, strict=strict + hasher_id="test_v1", type_handler_registry=registry, strict=strict ) @@ -822,7 +822,7 @@ def test_repr_includes_hash(self, hasher): # --------------------------------------------------------------------------- -# 14. PythonTypeSemanticHasherRegistry +# 14. PythonTypeHandlerRegistry # --------------------------------------------------------------------------- @@ -847,27 +847,27 @@ class GrandChild(Child): pass -class TestPythonTypeSemanticHasherRegistry: +class TestPythonTypeHandlerRegistry: def test_register_and_get_exact(self): - reg = PythonTypeSemanticHasherRegistry() + reg = PythonTypeHandlerRegistry() h = _DummySemanticHasher("base") reg.register(Base, h) assert reg.get_semantic_hasher(Base()) is h def test_mro_lookup_child(self): - reg = PythonTypeSemanticHasherRegistry() + reg = PythonTypeHandlerRegistry() h = _DummySemanticHasher("base") reg.register(Base, h) assert reg.get_semantic_hasher(Child()) is h def test_mro_lookup_grandchild(self): - reg = PythonTypeSemanticHasherRegistry() + reg = PythonTypeHandlerRegistry() h = _DummySemanticHasher("base") reg.register(Base, h) assert reg.get_semantic_hasher(GrandChild()) is h def test_more_specific_handler_wins(self): - reg = PythonTypeSemanticHasherRegistry() + reg = PythonTypeHandlerRegistry() h_base = _DummySemanticHasher("base") h_child = _DummySemanticHasher("child") reg.register(Base, h_base) @@ -876,22 +876,22 @@ def test_more_specific_handler_wins(self): assert reg.get_semantic_hasher(GrandChild()) is h_child def test_unregistered_returns_none(self): - reg = PythonTypeSemanticHasherRegistry() + reg = PythonTypeHandlerRegistry() assert reg.get_semantic_hasher(Base()) is None def test_unregister_removes_handler(self): - reg = PythonTypeSemanticHasherRegistry() + reg = PythonTypeHandlerRegistry() h = _DummySemanticHasher("base") reg.register(Base, h) assert reg.unregister(Base) is True assert reg.get_semantic_hasher(Base()) is None def test_unregister_nonexistent_returns_false(self): - reg = PythonTypeSemanticHasherRegistry() + reg = PythonTypeHandlerRegistry() assert reg.unregister(Base) is False def test_replace_existing_handler(self): - reg = PythonTypeSemanticHasherRegistry() + reg = PythonTypeHandlerRegistry() h1 = _DummySemanticHasher("first") h2 = _DummySemanticHasher("second") reg.register(Base, h1) @@ -899,26 +899,26 @@ def test_replace_existing_handler(self): assert reg.get_semantic_hasher(Base()) is h2 def test_register_non_type_raises(self): - reg = PythonTypeSemanticHasherRegistry() + reg = PythonTypeHandlerRegistry() with pytest.raises(TypeError): reg.register("not_a_type", _DummySemanticHasher("x")) # type: ignore[arg-type] def test_has_handler_exact(self): - reg = PythonTypeSemanticHasherRegistry() + reg = PythonTypeHandlerRegistry() reg.register(Base, _DummySemanticHasher("b")) assert reg.has_semantic_hasher(Base) is True def test_has_handler_via_mro(self): - reg = PythonTypeSemanticHasherRegistry() + reg = PythonTypeHandlerRegistry() reg.register(Base, _DummySemanticHasher("b")) assert reg.has_semantic_hasher(Child) is True def test_has_handler_false(self): - reg = PythonTypeSemanticHasherRegistry() + reg = PythonTypeHandlerRegistry() assert reg.has_semantic_hasher(Base) is False def test_registered_types_snapshot(self): - reg = PythonTypeSemanticHasherRegistry() + reg = PythonTypeHandlerRegistry() reg.register(Base, _DummySemanticHasher("b")) reg.register(Child, _DummySemanticHasher("c")) types = reg.registered_types() @@ -926,7 +926,7 @@ def test_registered_types_snapshot(self): assert Child in types def test_len(self): - reg = PythonTypeSemanticHasherRegistry() + reg = PythonTypeHandlerRegistry() assert len(reg) == 0 reg.register(Base, _DummySemanticHasher("b")) assert len(reg) == 1 @@ -934,7 +934,7 @@ def test_len(self): assert len(reg) == 2 def test_get_handler_for_type(self): - reg = PythonTypeSemanticHasherRegistry() + reg = PythonTypeHandlerRegistry() h = _DummySemanticHasher("b") reg.register(Base, h) assert reg.get_semantic_hasher_for_type(Base) is h @@ -959,31 +959,31 @@ def handle(self, obj: Any, hasher: Any) -> Any: class TestCustomHandlerRegistration: def test_register_custom_type(self): - registry = PythonTypeSemanticHasherRegistry() - register_builtin_python_type_semantic_hashers(registry) + registry = PythonTypeHandlerRegistry() + register_builtin_python_type_handlers(registry) registry.register(Celsius, CelsiusHandler()) custom_hasher = SemanticAwarePythonHasher( - hasher_id="custom_v1", type_semantic_hasher_registry=registry, strict=True + hasher_id="custom_v1", type_handler_registry=registry, strict=True ) assert isinstance(custom_hasher.hash_object(Celsius(100.0)), ContentHash) def test_custom_handler_determinism(self): - registry = PythonTypeSemanticHasherRegistry() - register_builtin_python_type_semantic_hashers(registry) + registry = PythonTypeHandlerRegistry() + register_builtin_python_type_handlers(registry) registry.register(Celsius, CelsiusHandler()) custom_hasher = SemanticAwarePythonHasher( - hasher_id="custom_v1", type_semantic_hasher_registry=registry + hasher_id="custom_v1", type_handler_registry=registry ) h1 = custom_hasher.hash_object(Celsius(37.5)) h2 = custom_hasher.hash_object(Celsius(37.5)) assert h1 == h2 def test_custom_handler_different_values_differ(self): - registry = PythonTypeSemanticHasherRegistry() - register_builtin_python_type_semantic_hashers(registry) + registry = PythonTypeHandlerRegistry() + register_builtin_python_type_handlers(registry) registry.register(Celsius, CelsiusHandler()) custom_hasher = SemanticAwarePythonHasher( - hasher_id="custom_v1", type_semantic_hasher_registry=registry + hasher_id="custom_v1", type_handler_registry=registry ) assert custom_hasher.hash_object(Celsius(0.0)) != custom_hasher.hash_object( Celsius(100.0) @@ -995,11 +995,11 @@ def test_unregistered_type_still_strict(self): hasher.hash_object(Celsius(42.0)) def test_custom_handler_in_nested_structure(self): - registry = PythonTypeSemanticHasherRegistry() - register_builtin_python_type_semantic_hashers(registry) + registry = PythonTypeHandlerRegistry() + register_builtin_python_type_handlers(registry) registry.register(Celsius, CelsiusHandler()) custom_hasher = SemanticAwarePythonHasher( - hasher_id="custom_v1", type_semantic_hasher_registry=registry + hasher_id="custom_v1", type_handler_registry=registry ) h = custom_hasher.hash_object({"temp": Celsius(36.6), "unit": "C"}) assert isinstance(h, ContentHash) @@ -1011,11 +1011,11 @@ class DirectHashHandler: def handle(self, obj: Any, hasher: Any) -> ContentHash: return ContentHash("direct", b"\xaa" * 32) - registry = PythonTypeSemanticHasherRegistry() - register_builtin_python_type_semantic_hashers(registry) + registry = PythonTypeHandlerRegistry() + register_builtin_python_type_handlers(registry) registry.register(Celsius, DirectHashHandler()) custom_hasher = SemanticAwarePythonHasher( - hasher_id="custom_v1", type_semantic_hasher_registry=registry + hasher_id="custom_v1", type_handler_registry=registry ) result = custom_hasher.hash_object(Celsius(0.0)) # The ContentHash returned by the handler should come back as-is @@ -1025,11 +1025,11 @@ def test_mro_aware_custom_handler(self): class FancyCelsius(Celsius): pass - registry = PythonTypeSemanticHasherRegistry() - register_builtin_python_type_semantic_hashers(registry) + registry = PythonTypeHandlerRegistry() + register_builtin_python_type_handlers(registry) registry.register(Celsius, CelsiusHandler()) custom_hasher = SemanticAwarePythonHasher( - hasher_id="custom_v1", type_semantic_hasher_registry=registry + hasher_id="custom_v1", type_handler_registry=registry ) h = custom_hasher.hash_object(FancyCelsius(20.0)) assert isinstance(h, ContentHash) @@ -1045,7 +1045,7 @@ class KelvinHandler: def handle(self, obj: Any, hasher: Any) -> Any: return {"__type__": "Kelvin", "k": obj.k} - global_registry = get_default_python_type_semantic_hasher_registry() + global_registry = get_default_python_type_handler_registry() global_registry.register(Kelvin, KelvinHandler()) try: default_hasher = get_default_semantic_hasher() @@ -1067,8 +1067,8 @@ def test_get_default_semantic_hasher_has_versioned_id(self): assert get_default_semantic_hasher().hasher_id == "semantic_v0.1" def test_get_default_type_handler_registry_is_singleton(self): - r1 = get_default_python_type_semantic_hasher_registry() - r2 = get_default_python_type_semantic_hasher_registry() + r1 = get_default_python_type_handler_registry() + r2 = get_default_python_type_handler_registry() assert r1 is r2 def test_default_registry_has_builtin_handlers(self): @@ -1076,7 +1076,7 @@ def test_default_registry_has_builtin_handlers(self): import typing as _typing - reg = get_default_python_type_semantic_hasher_registry() + reg = get_default_python_type_handler_registry() assert reg.has_semantic_hasher(bytes) assert reg.has_semantic_hasher(bytearray) assert reg.has_semantic_hasher(UUID) @@ -1090,7 +1090,7 @@ def test_default_registry_has_builtin_handlers(self): def test_default_registry_has_no_content_hash_handler(self): """ContentHash is handled as a terminal -- no registry entry needed.""" - reg = get_default_python_type_semantic_hasher_registry() + reg = get_default_python_type_handler_registry() assert not reg.has_semantic_hasher(ContentHash) def test_default_hasher_can_hash_common_types(self): diff --git a/tests/test_hashing/test_uuid_handler.py b/tests/test_hashing/test_uuid_handler.py index 3e6fe1f8..b9e57cd9 100644 --- a/tests/test_hashing/test_uuid_handler.py +++ b/tests/test_hashing/test_uuid_handler.py @@ -1,6 +1,6 @@ -"""Tests for UUIDSemanticHasher hash() method behaviour. +"""Tests for UUIDHandler hash() method behaviour. -Verifies that UUIDSemanticHasher produces a ContentHash based on the 16-byte +Verifies that UUIDHandler produces a ContentHash based on the 16-byte binary representation of a UUID, consistent with OrcaPod's canonical ``pa.binary(16)`` Arrow storage format. """ @@ -15,21 +15,21 @@ def _make_hasher() -> SemanticAwarePythonHasher: from orcapod.hashing.semantic_hashing.builtin_handlers import ( - register_builtin_python_type_semantic_hashers, + register_builtin_python_type_handlers, ) from orcapod.hashing.semantic_hashing.type_handler_registry import ( - PythonTypeSemanticHasherRegistry, + PythonTypeHandlerRegistry, ) - registry = PythonTypeSemanticHasherRegistry() - register_builtin_python_type_semantic_hashers(registry) + registry = PythonTypeHandlerRegistry() + register_builtin_python_type_handlers(registry) return SemanticAwarePythonHasher( - hasher_id="test_v1", type_semantic_hasher_registry=registry, strict=True + hasher_id="test_v1", type_handler_registry=registry, strict=True ) def test_uuid_handler_returns_content_hash(): - """UUIDSemanticHasher should return a ContentHash for a UUID.""" + """UUIDHandler should return a ContentHash for a UUID.""" hasher = _make_hasher() u = _uuid.UUID("550e8400-e29b-41d4-a716-446655440000") result = hasher.hash_object(u) From 5c12aa0179d2121e0c9a3fc8055eaad58f7c1524 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Wed, 24 Jun 2026 23:17:55 +0000 Subject: [PATCH 27/33] =?UTF-8?q?docs(test=5Fhashing):=20update=20stale=20?= =?UTF-8?q?BaseSemanticHasher=20=E2=86=92=20SemanticAwarePythonHasher=20in?= =?UTF-8?q?=20comments?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_hashing/generate_hash_examples.py | 5 ++--- tests/test_hashing/test_hash_samples.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_hashing/generate_hash_examples.py b/tests/test_hashing/generate_hash_examples.py index 5edbef3f..f9e58e7f 100644 --- a/tests/test_hashing/generate_hash_examples.py +++ b/tests/test_hashing/generate_hash_examples.py @@ -3,8 +3,7 @@ # throughout the tests to ensure consistent hashing behavior across different runs # and revisions of the codebase. # -# Uses the new BaseSemanticHasher API (get_default_semantic_hasher) rather than -# the legacy hash_to_hex / hash_to_int / hash_to_uuid functions. +# Uses SemanticAwarePythonHasher via get_default_semantic_hasher. import json from collections import OrderedDict @@ -27,7 +26,7 @@ def generate_hash_examples(): - """Generate hash examples for various data structures using BaseSemanticHasher.""" + """Generate hash examples for various data structures using ``SemanticAwarePythonHasher``.""" hasher = get_default_semantic_hasher() examples = [] diff --git a/tests/test_hashing/test_hash_samples.py b/tests/test_hashing/test_hash_samples.py index 4caff744..b255f818 100644 --- a/tests/test_hashing/test_hash_samples.py +++ b/tests/test_hashing/test_hash_samples.py @@ -1,7 +1,7 @@ """ Tests for hash samples consistency. -Verifies that BaseSemanticHasher produces identical hashes across runs for a +Verifies that SemanticAwarePythonHasher produces identical hashes across runs for a fixed set of recorded input values. The sample file is generated (or regenerated) by running generate_hash_examples.py. From 596333b3941b15274e001430c5bd57a84134e128 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Thu, 25 Jun 2026 00:03:25 +0000 Subject: [PATCH 28/33] =?UTF-8?q?fix(context):=20rename=20function=5Finfo?= =?UTF-8?q?=5Fextractor=20=E2=86=92=20function=5Fsemantic=5Fhasher=20in=20?= =?UTF-8?q?v0.1=20config?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/orcapod/contexts/data/schemas/context_schema.json | 4 ++-- src/orcapod/contexts/data/v0.1.json | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/orcapod/contexts/data/schemas/context_schema.json b/src/orcapod/contexts/data/schemas/context_schema.json index 73f07dd4..366ce12f 100644 --- a/src/orcapod/contexts/data/schemas/context_schema.json +++ b/src/orcapod/contexts/data/schemas/context_schema.json @@ -61,9 +61,9 @@ "$ref": "#/$defs/objectspec", "description": "ObjectSpec for the file content hasher (used by PathHandler)" }, - "function_info_extractor": { + "function_semantic_hasher": { "$ref": "#/$defs/objectspec", - "description": "ObjectSpec for the function info extractor (used by FunctionHandler)" + "description": "ObjectSpec for the function semantic hasher (used by FunctionHandler)" }, "metadata": { "type": "object", diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index 07e8e686..75da5243 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -50,7 +50,7 @@ } } }, - "function_info_extractor": { + "function_semantic_hasher": { "_class": "orcapod.hashing.semantic_hashing.function_info_extractors.FunctionSignatureExtractor", "_config": { "include_module": true, @@ -66,9 +66,9 @@ [{"_type": "pathlib.Path"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.PathHandler", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], [{"_type": "upath.core.UPath"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UPathHandler", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], [{"_type": "uuid.UUID"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UUIDHandler", "_config": {}}], - [{"_type": "types.FunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], - [{"_type": "types.BuiltinFunctionType"},{"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], - [{"_type": "types.MethodType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], + [{"_type": "types.FunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_semantic_hasher"}}}], + [{"_type": "types.BuiltinFunctionType"},{"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_semantic_hasher"}}}], + [{"_type": "types.MethodType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_semantic_hasher"}}}], [{"_type": "builtins.type"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.TypeObjectHandler", "_config": {}}], [{"_type": "types.GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasHandler", "_config": {}}], [{"_type": "types.UnionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UnionTypeHandler", "_config": {}}], From d71bf194fcb96bca8cb36b3a1dab03f3a5187331 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Thu, 25 Jun 2026 00:32:44 +0000 Subject: [PATCH 29/33] refactor(hashing): rename registry methods, add HandlerRegistryProtocol, decouple type annotations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - PythonTypeHandlerRegistry: rename get_semantic_hasher → get_handler, get_semantic_hasher_for_type → get_handler_for_type, has_semantic_hasher → has_handler; update all call sites - hashing_protocols: add HandlerRegistryProtocol abstracting over the concrete registry; SemanticHasherProtocol.type_handler_registry now returns HandlerRegistryProtocol instead of PythonTypeHandlerRegistry; PythonTypeHandler.handle() now uses SemanticHasherProtocol instead of the concrete SemanticAwarePythonHasher; remove concrete-class imports from TYPE_CHECKING block - versioned_hashers: type type_handler_registry param as HandlerRegistryProtocol | None instead of Any | None; drop unused Any import - Update test_hashing.py and test_semantic_hasher.py for renamed methods Co-Authored-By: Claude Sonnet 4.6 --- .../semantic_hashing/semantic_hasher.py | 10 ++-- .../semantic_hashing/type_handler_registry.py | 16 +++--- src/orcapod/hashing/versioned_hashers.py | 3 +- src/orcapod/hashing/visitors.py | 2 +- src/orcapod/protocols/hashing_protocols.py | 38 ++++++++++---- test-objective/unit/test_hashing.py | 42 ++++++++-------- tests/test_hashing/test_semantic_hasher.py | 50 +++++++++---------- 7 files changed, 89 insertions(+), 72 deletions(-) diff --git a/src/orcapod/hashing/semantic_hashing/semantic_hasher.py b/src/orcapod/hashing/semantic_hashing/semantic_hasher.py index a77b2750..ad895fdb 100644 --- a/src/orcapod/hashing/semantic_hashing/semantic_hasher.py +++ b/src/orcapod/hashing/semantic_hashing/semantic_hasher.py @@ -180,14 +180,14 @@ def hash_object( # Semantic hasher dispatch: handler returns a representative Python structure # (or a ContentHash as terminal); feed the result back into hash_object so # that returning a plain structure is equivalent to calling hash_object on it. - semantic_hasher = self._registry.get_semantic_hasher(obj) - if semantic_hasher is not None: + handler = self._registry.get_handler(obj) + if handler is not None: logger.debug( - "hash_object: dispatching %s to semantic hasher %s", + "hash_object: dispatching %s to handler %s", type(obj).__name__, - type(semantic_hasher).__name__, + type(handler).__name__, ) - result = semantic_hasher.handle(obj, self) + result = handler.handle(obj, self) return self.hash_object(result, resolver=resolver) # ContentIdentifiableProtocol: use resolver if provided, else content_hash(). diff --git a/src/orcapod/hashing/semantic_hashing/type_handler_registry.py b/src/orcapod/hashing/semantic_hashing/type_handler_registry.py index 1fcc46b9..84614dbd 100644 --- a/src/orcapod/hashing/semantic_hashing/type_handler_registry.py +++ b/src/orcapod/hashing/semantic_hashing/type_handler_registry.py @@ -90,11 +90,11 @@ def unregister(self, target_type: type) -> bool: return True return False - def get_semantic_hasher(self, obj: Any) -> "PythonTypeHandler | None": - """Look up the hasher for *obj* using MRO-aware resolution. + def get_handler(self, obj: Any) -> "PythonTypeHandler | None": + """Look up the handler for *obj* using MRO-aware resolution. Args: - obj: The object for which a hasher is needed. + obj: The object for which a handler is needed. Returns: The registered ``PythonTypeHandler``, or None. @@ -115,10 +115,10 @@ def get_semantic_hasher(self, obj: Any) -> "PythonTypeHandler | None": return handler return None - def get_semantic_hasher_for_type( + def get_handler_for_type( self, target_type: type ) -> "PythonTypeHandler | None": - """Look up the hasher for a *type object* (rather than an instance). + """Look up the handler for a *type object* (rather than an instance). Args: target_type: The type to look up. @@ -136,13 +136,13 @@ def get_semantic_hasher_for_type( return handler return None - def has_semantic_hasher(self, target_type: type) -> bool: - """Return True if a hasher is registered for *target_type* or any MRO ancestor. + def has_handler(self, target_type: type) -> bool: + """Return True if a handler is registered for *target_type* or any MRO ancestor. Args: target_type: The type to check. """ - return self.get_semantic_hasher_for_type(target_type) is not None + return self.get_handler_for_type(target_type) is not None def registered_types(self) -> list[type]: """Return a list of all directly-registered types (no MRO expansion).""" diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index 428e065b..33b5a7da 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -21,7 +21,6 @@ from __future__ import annotations import logging -from typing import Any from orcapod.protocols import hashing_protocols as hp @@ -49,7 +48,7 @@ def get_versioned_semantic_hasher( hasher_id: str = _CURRENT_SEMANTIC_HASHER_ID, strict: bool = True, - type_handler_registry: "Any | None" = None, + type_handler_registry: "hp.HandlerRegistryProtocol | None" = None, ) -> hp.SemanticHasherProtocol: """Return a SemanticAwarePythonHasher configured for the current version. diff --git a/src/orcapod/hashing/visitors.py b/src/orcapod/hashing/visitors.py index feeab8d6..a84be1c9 100644 --- a/src/orcapod/hashing/visitors.py +++ b/src/orcapod/hashing/visitors.py @@ -210,7 +210,7 @@ def visit_extension( return extension_type, storage_value # Only hash if a semantic hasher is registered for this Python type. - if not self._python_hasher.type_handler_registry.has_semantic_hasher( + if not self._python_hasher.type_handler_registry.has_handler( python_type ): return extension_type, storage_value diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index 33469fc9..119f8b3f 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -9,8 +9,6 @@ if TYPE_CHECKING: import pyarrow as pa - from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeHandlerRegistry - from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher @runtime_checkable @@ -53,27 +51,27 @@ class PythonTypeHandler(Protocol): """Protocol for type-specific semantic hashers used by SemanticAwarePythonHasher. A ``PythonTypeHandler`` converts a specific Python type into a - representative Python structure that ``SemanticAwarePythonHasher.hash_object()`` + representative Python structure that ``SemanticHasherProtocol.hash_object()`` can then hash. Implementations are registered with a - ``PythonTypeHandlerRegistry`` and looked up via MRO-aware resolution. + ``HandlerRegistryProtocol`` and looked up via MRO-aware resolution. - Each implementation receives the full ``SemanticAwarePythonHasher`` so it can + Each implementation receives the full ``SemanticHasherProtocol`` so it can delegate hashing of sub-values back to the outer hasher without coupling to a specific hasher instance. """ - def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: """Return a representative Python structure for *obj*. The returned value is passed back into - ``SemanticAwarePythonHasher.hash_object()`` for final hashing. Returning + ``SemanticHasherProtocol.hash_object()`` for final hashing. Returning a ``ContentHash`` short-circuits the process: the caller returns it as-is without re-hashing. This is useful for handlers that compute content-based hashes from external data (e.g. file content, Arrow tables). Args: obj: The object to hash. Always matches the registered type. - hasher: The active ``SemanticAwarePythonHasher``. Use + hasher: The active ``SemanticHasherProtocol``. Use ``hasher.hash_object(sub_value)`` to hash sub-values that require type-specific treatment. @@ -85,6 +83,26 @@ def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: ... +class HandlerRegistryProtocol(Protocol): + """Protocol for type handler registries used by ``SemanticHasherProtocol``. + + Abstracts over ``PythonTypeHandlerRegistry`` so that ``SemanticHasherProtocol`` + and its consumers do not depend on the concrete registry class. + """ + + def get_handler(self, obj: Any) -> "PythonTypeHandler | None": + """Look up the handler for *obj* using MRO-aware resolution.""" + ... + + def get_handler_for_type(self, target_type: type) -> "PythonTypeHandler | None": + """Look up the handler for a type object (rather than an instance).""" + ... + + def has_handler(self, target_type: type) -> bool: + """Return True if a handler is registered for *target_type* or any MRO ancestor.""" + ... + + class SemanticHasherProtocol(Protocol): """Protocol for the semantic content-based hasher.""" @@ -102,8 +120,8 @@ def hasher_id(self) -> str: ... @property - def type_handler_registry(self) -> "PythonTypeHandlerRegistry": - """Return the PythonTypeHandlerRegistry used by this hasher.""" + def type_handler_registry(self) -> HandlerRegistryProtocol: + """Return the handler registry used by this hasher.""" ... diff --git a/test-objective/unit/test_hashing.py b/test-objective/unit/test_hashing.py index 0ef408f4..e82083a3 100644 --- a/test-objective/unit/test_hashing.py +++ b/test-objective/unit/test_hashing.py @@ -271,22 +271,22 @@ def test_true_vs_one(self, hasher: SemanticAwarePythonHasher) -> None: # =================================================================== -# PythonTypeHandlerRegistry -- register/get_semantic_hasher roundtrip +# PythonTypeHandlerRegistry -- register/get_handler roundtrip # =================================================================== class TestPythonTypeHandlerRegistryBasics: - """register() + get_semantic_hasher() roundtrip.""" + """register() + get_handler() roundtrip.""" - def test_register_and_get_semantic_hasher(self, registry: PythonTypeHandlerRegistry) -> None: + def test_register_and_get_handler(self, registry: PythonTypeHandlerRegistry) -> None: handler = _FakeHandler() registry.register(int, handler) - assert registry.get_semantic_hasher(42) is handler + assert registry.get_handler(42) is handler - def test_get_semantic_hasher_returns_none_for_unregistered( + def test_get_handler_returns_none_for_unregistered( self, registry: PythonTypeHandlerRegistry ) -> None: - assert registry.get_semantic_hasher("hello") is None + assert registry.get_handler("hello") is None # =================================================================== @@ -308,7 +308,7 @@ class Child(Base): handler = _FakeHandler() registry.register(Base, handler) - assert registry.get_semantic_hasher(Child()) is handler + assert registry.get_handler(Child()) is handler def test_specific_handler_overrides_parent( self, registry: PythonTypeHandlerRegistry @@ -323,8 +323,8 @@ class Child(Base): child_handler = _FakeHandler("child") registry.register(Base, parent_handler) registry.register(Child, child_handler) - assert registry.get_semantic_hasher(Child()) is child_handler - assert registry.get_semantic_hasher(Base()) is parent_handler + assert registry.get_handler(Child()) is child_handler + assert registry.get_handler(Base()) is parent_handler # =================================================================== @@ -340,7 +340,7 @@ def test_unregister_existing(self, registry: PythonTypeHandlerRegistry) -> None: registry.register(int, handler) result = registry.unregister(int) assert result is True - assert registry.get_semantic_hasher(42) is None + assert registry.get_handler(42) is None def test_unregister_nonexistent(self, registry: PythonTypeHandlerRegistry) -> None: result = registry.unregister(float) @@ -348,21 +348,21 @@ def test_unregister_nonexistent(self, registry: PythonTypeHandlerRegistry) -> No # =================================================================== -# PythonTypeHandlerRegistry -- has_semantic_hasher +# PythonTypeHandlerRegistry -- has_handler # =================================================================== -class TestPythonTypeHandlerRegistryHasSemanticHasher: - """has_semantic_hasher() boolean check.""" +class TestPythonTypeHandlerRegistryHasHandler: + """has_handler() boolean check.""" - def test_has_semantic_hasher_true(self, registry: PythonTypeHandlerRegistry) -> None: + def test_has_handler_true(self, registry: PythonTypeHandlerRegistry) -> None: registry.register(int, _FakeHandler()) - assert registry.has_semantic_hasher(int) is True + assert registry.has_handler(int) is True - def test_has_semantic_hasher_false(self, registry: PythonTypeHandlerRegistry) -> None: - assert registry.has_semantic_hasher(str) is False + def test_has_handler_false(self, registry: PythonTypeHandlerRegistry) -> None: + assert registry.has_handler(str) is False - def test_has_semantic_hasher_via_mro(self, registry: PythonTypeHandlerRegistry) -> None: + def test_has_handler_via_mro(self, registry: PythonTypeHandlerRegistry) -> None: class Base: pass @@ -370,7 +370,7 @@ class Child(Base): pass registry.register(Base, _FakeHandler()) - assert registry.has_semantic_hasher(Child) is True + assert registry.has_handler(Child) is True # =================================================================== @@ -413,9 +413,9 @@ def register_types(start: int, count: int) -> None: def lookup_types() -> None: try: for _ in range(100): - registry.get_semantic_hasher(42) + registry.get_handler(42) registry.registered_types() - registry.has_semantic_hasher(int) + registry.has_handler(int) except Exception as exc: errors.append(exc) diff --git a/tests/test_hashing/test_semantic_hasher.py b/tests/test_hashing/test_semantic_hasher.py index bb1f8c12..dc074fcb 100644 --- a/tests/test_hashing/test_semantic_hasher.py +++ b/tests/test_hashing/test_semantic_hasher.py @@ -852,19 +852,19 @@ def test_register_and_get_exact(self): reg = PythonTypeHandlerRegistry() h = _DummySemanticHasher("base") reg.register(Base, h) - assert reg.get_semantic_hasher(Base()) is h + assert reg.get_handler(Base()) is h def test_mro_lookup_child(self): reg = PythonTypeHandlerRegistry() h = _DummySemanticHasher("base") reg.register(Base, h) - assert reg.get_semantic_hasher(Child()) is h + assert reg.get_handler(Child()) is h def test_mro_lookup_grandchild(self): reg = PythonTypeHandlerRegistry() h = _DummySemanticHasher("base") reg.register(Base, h) - assert reg.get_semantic_hasher(GrandChild()) is h + assert reg.get_handler(GrandChild()) is h def test_more_specific_handler_wins(self): reg = PythonTypeHandlerRegistry() @@ -872,19 +872,19 @@ def test_more_specific_handler_wins(self): h_child = _DummySemanticHasher("child") reg.register(Base, h_base) reg.register(Child, h_child) - assert reg.get_semantic_hasher(Child()) is h_child - assert reg.get_semantic_hasher(GrandChild()) is h_child + assert reg.get_handler(Child()) is h_child + assert reg.get_handler(GrandChild()) is h_child def test_unregistered_returns_none(self): reg = PythonTypeHandlerRegistry() - assert reg.get_semantic_hasher(Base()) is None + assert reg.get_handler(Base()) is None def test_unregister_removes_handler(self): reg = PythonTypeHandlerRegistry() h = _DummySemanticHasher("base") reg.register(Base, h) assert reg.unregister(Base) is True - assert reg.get_semantic_hasher(Base()) is None + assert reg.get_handler(Base()) is None def test_unregister_nonexistent_returns_false(self): reg = PythonTypeHandlerRegistry() @@ -896,7 +896,7 @@ def test_replace_existing_handler(self): h2 = _DummySemanticHasher("second") reg.register(Base, h1) reg.register(Base, h2) - assert reg.get_semantic_hasher(Base()) is h2 + assert reg.get_handler(Base()) is h2 def test_register_non_type_raises(self): reg = PythonTypeHandlerRegistry() @@ -906,16 +906,16 @@ def test_register_non_type_raises(self): def test_has_handler_exact(self): reg = PythonTypeHandlerRegistry() reg.register(Base, _DummySemanticHasher("b")) - assert reg.has_semantic_hasher(Base) is True + assert reg.has_handler(Base) is True def test_has_handler_via_mro(self): reg = PythonTypeHandlerRegistry() reg.register(Base, _DummySemanticHasher("b")) - assert reg.has_semantic_hasher(Child) is True + assert reg.has_handler(Child) is True def test_has_handler_false(self): reg = PythonTypeHandlerRegistry() - assert reg.has_semantic_hasher(Base) is False + assert reg.has_handler(Base) is False def test_registered_types_snapshot(self): reg = PythonTypeHandlerRegistry() @@ -937,9 +937,9 @@ def test_get_handler_for_type(self): reg = PythonTypeHandlerRegistry() h = _DummySemanticHasher("b") reg.register(Base, h) - assert reg.get_semantic_hasher_for_type(Base) is h - assert reg.get_semantic_hasher_for_type(Child) is h # via MRO - assert reg.get_semantic_hasher_for_type(int) is None + assert reg.get_handler_for_type(Base) is h + assert reg.get_handler_for_type(Child) is h # via MRO + assert reg.get_handler_for_type(int) is None # --------------------------------------------------------------------------- @@ -1077,21 +1077,21 @@ def test_default_registry_has_builtin_handlers(self): import typing as _typing reg = get_default_python_type_handler_registry() - assert reg.has_semantic_hasher(bytes) - assert reg.has_semantic_hasher(bytearray) - assert reg.has_semantic_hasher(UUID) - assert reg.has_semantic_hasher(Path) - assert reg.has_semantic_hasher(_types.FunctionType) - assert reg.has_semantic_hasher(type) - assert reg.has_semantic_hasher(_types.GenericAlias) - assert reg.has_semantic_hasher(_types.UnionType) - assert reg.has_semantic_hasher(_typing._GenericAlias) # type: ignore[attr-defined] - assert reg.has_semantic_hasher(_typing._SpecialForm) # type: ignore[attr-defined] + assert reg.has_handler(bytes) + assert reg.has_handler(bytearray) + assert reg.has_handler(UUID) + assert reg.has_handler(Path) + assert reg.has_handler(_types.FunctionType) + assert reg.has_handler(type) + assert reg.has_handler(_types.GenericAlias) + assert reg.has_handler(_types.UnionType) + assert reg.has_handler(_typing._GenericAlias) # type: ignore[attr-defined] + assert reg.has_handler(_typing._SpecialForm) # type: ignore[attr-defined] def test_default_registry_has_no_content_hash_handler(self): """ContentHash is handled as a terminal -- no registry entry needed.""" reg = get_default_python_type_handler_registry() - assert not reg.has_semantic_hasher(ContentHash) + assert not reg.has_handler(ContentHash) def test_default_hasher_can_hash_common_types(self): h = get_default_semantic_hasher() From e8129d7c870c159d580c066a6d3c01f9ce8456ea Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Thu, 25 Jun 2026 00:55:26 +0000 Subject: [PATCH 30/33] refactor(hashing): enforce Protocol naming convention and decouple concrete types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename PythonTypeHandler → PythonTypeHandlerProtocol everywhere: class definition in hashing_protocols.py, all type annotations in type_handler_registry.py, hashing/__init__.py export, and all docstring references across builtin_handlers.py and semantic_hashing/__init__.py - Rename CallableWithPod → CallableWithPodProtocol in function_pod.py - SemanticAwarePythonHasher.__init__ now accepts HandlerRegistryProtocol | None instead of PythonTypeHandlerRegistry | None; drop concrete-class import - SemanticAwarePythonHasher.type_handler_registry property now returns HandlerRegistryProtocol instead of PythonTypeHandlerRegistry - ContentIdentifiableMixin now imports and uses SemanticHasherProtocol instead of the concrete SemanticAwarePythonHasher for __init__ param and _get_hasher return type - Update strict-mode error messages to say "no implementation of PythonTypeHandlerProtocol registered"; update matching test assertions Co-Authored-By: Claude Sonnet 4.6 --- src/orcapod/core/function_pod.py | 8 +++--- src/orcapod/hashing/__init__.py | 6 ++--- .../hashing/semantic_hashing/__init__.py | 4 +-- .../semantic_hashing/builtin_handlers.py | 2 +- .../content_identifiable_mixin.py | 20 +++++++------- .../semantic_hashing/semantic_hasher.py | 26 +++++++++---------- .../semantic_hashing/type_handler_registry.py | 24 ++++++++--------- src/orcapod/protocols/hashing_protocols.py | 10 +++---- test-objective/unit/test_hashing.py | 4 +-- tests/test_hashing/test_semantic_hasher.py | 2 +- 10 files changed, 53 insertions(+), 53 deletions(-) diff --git a/src/orcapod/core/function_pod.py b/src/orcapod/core/function_pod.py index ac4dd854..be2fee48 100644 --- a/src/orcapod/core/function_pod.py +++ b/src/orcapod/core/function_pod.py @@ -656,7 +656,7 @@ def as_table( return output_table -class CallableWithPod(Protocol): +class CallableWithPodProtocol(Protocol): @property def pod(self) -> _FunctionPodBase: """Return the associated function pod.""" @@ -676,7 +676,7 @@ def function_pod( pod_cache_database: ArrowDatabaseProtocol | None = None, executor: DataFunctionExecutorProtocol | None = None, **kwargs, -) -> Callable[..., CallableWithPod]: +) -> Callable[..., CallableWithPodProtocol]: """Decorator that attaches a ``FunctionPod`` as a ``pod`` attribute. Args: @@ -696,7 +696,7 @@ def function_pod( A decorator that adds a ``pod`` attribute to the wrapped function. """ - def decorator(func: Callable) -> CallableWithPod: + def decorator(func: Callable) -> CallableWithPodProtocol: if func.__name__ == "": raise ValueError("Lambda functions cannot be used with function_pod") @@ -736,7 +736,7 @@ def wrapper(*args, **kwargs): return func(*args, **kwargs) setattr(wrapper, "pod", pod) - return cast(CallableWithPod, wrapper) + return cast(CallableWithPodProtocol, wrapper) return decorator diff --git a/src/orcapod/hashing/__init__.py b/src/orcapod/hashing/__init__.py index 658180a0..5c4ddc1f 100644 --- a/src/orcapod/hashing/__init__.py +++ b/src/orcapod/hashing/__init__.py @@ -5,7 +5,7 @@ ---------- SemanticAwarePythonHasher -- content-based recursive object hasher SemanticHasherProtocol -- protocol for semantic hashers - PythonTypeHandlerRegistry -- registry mapping types to PythonTypeHandler instances + PythonTypeHandlerRegistry -- registry mapping types to PythonTypeHandlerProtocol instances get_default_semantic_hasher -- global default SemanticHasherProtocol factory get_default_python_type_handler_registry -- global default registry factory ContentIdentifiableMixin -- convenience mixin for content-identifiable objects @@ -53,7 +53,7 @@ ContentIdentifiableProtocol, FileContentHasherProtocol, FunctionInfoExtractorProtocol, - PythonTypeHandler, + PythonTypeHandlerProtocol, SemanticHasherProtocol, SemanticTypeHasherProtocol, StringCacherProtocol, @@ -97,7 +97,7 @@ "register_builtin_python_type_handlers", "SemanticHasherProtocol", "ContentIdentifiableProtocol", - "PythonTypeHandler", + "PythonTypeHandlerProtocol", "FileContentHasherProtocol", "ArrowHasherProtocol", "StringCacherProtocol", diff --git a/src/orcapod/hashing/semantic_hashing/__init__.py b/src/orcapod/hashing/semantic_hashing/__init__.py index 67d4bd64..c8d139b3 100644 --- a/src/orcapod/hashing/semantic_hashing/__init__.py +++ b/src/orcapod/hashing/semantic_hashing/__init__.py @@ -2,11 +2,11 @@ orcapod.hashing.semantic_hashing ================================= SemanticAwarePythonHasher -- content-based recursive object hasher - PythonTypeHandlerRegistry -- MRO-aware registry mapping types → PythonTypeHandler + PythonTypeHandlerRegistry -- MRO-aware registry mapping types → PythonTypeHandlerProtocol BuiltinPythonTypeHandlerRegistry -- pre-populated registry with built-in hashers ContentIdentifiableMixin -- convenience mixin for content-identifiable objects -Built-in PythonTypeHandler implementations: +Built-in PythonTypeHandlerProtocol implementations: PathHandler -- pathlib.Path → file-content hash UUIDHandler -- uuid.UUID → canonical bytes BytesHandler -- bytes/bytearray → hex string diff --git a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py index 469a1fe5..096a6cdc 100644 --- a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py +++ b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py @@ -1,5 +1,5 @@ """ -Built-in PythonTypeHandler implementations. +Built-in PythonTypeHandlerProtocol implementations. PathHandler -- pathlib.Path: file content hash UPathHandler -- upath.UPath: file content hash (remote-aware) diff --git a/src/orcapod/hashing/semantic_hashing/content_identifiable_mixin.py b/src/orcapod/hashing/semantic_hashing/content_identifiable_mixin.py index effa94ad..703cba8d 100644 --- a/src/orcapod/hashing/semantic_hashing/content_identifiable_mixin.py +++ b/src/orcapod/hashing/semantic_hashing/content_identifiable_mixin.py @@ -3,13 +3,13 @@ Any class that implements ``identity_structure()`` can inherit from this mixin to gain a full suite of content-based identity helpers without having to wire -up a ``SemanticAwarePythonHasher`` manually: +up a ``SemanticHasherProtocol`` manually: - ``content_hash()`` -- returns a stable ContentHash for the object - ``__hash__()`` -- Python hash based on content (int) - ``__eq__()`` -- equality via content_hash comparison -The mixin uses the global default ``SemanticAwarePythonHasher`` by default, but +The mixin uses the global default ``SemanticHasherProtocol`` by default, but accepts an injected hasher for testing or custom configurations. Usage @@ -32,7 +32,7 @@ def identity_structure(self): With an injected hasher (e.g. in tests):: - hasher = SemanticAwarePythonHasher(hasher_id="test", strict=True) + hasher = SemanticHasherProtocol(hasher_id="test", strict=True) record = MyRecord("foo", 42) record._semantic_hasher = hasher print(record.content_hash()) @@ -65,7 +65,7 @@ def identity_structure(self): import logging from typing import Any -from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher +from orcapod.protocols.hashing_protocols import SemanticHasherProtocol from orcapod.types import ContentHash logger = logging.getLogger(__name__) @@ -82,19 +82,19 @@ def identity_structure(self) -> Any: ... The returned structure is recursively resolved and hashed by the - ``SemanticAwarePythonHasher`` to produce a stable ContentHash. + ``SemanticHasherProtocol`` to produce a stable ContentHash. Parameters (passed as keyword arguments to ``__init__``) --------------------------------------------------------- semantic_hasher: - Optional ``SemanticAwarePythonHasher`` instance to use. When omitted, + Optional ``SemanticHasherProtocol`` instance to use. When omitted, the hasher is obtained from the default data context via ``orcapod.contexts.get_default_context().semantic_hasher``, which is the single source of truth for versioned component configuration. """ def __init__( - self, *, semantic_hasher: SemanticAwarePythonHasher | None = None, **kwargs: Any + self, *, semantic_hasher: SemanticHasherProtocol | None = None, **kwargs: Any ) -> None: # Cooperative MRO-friendly init -- forward remaining kwargs up the chain. super().__init__(**kwargs) @@ -215,8 +215,8 @@ def _invalidate_content_hash_cache(self) -> None: # Hasher resolution # ------------------------------------------------------------------ - def _get_hasher(self) -> SemanticAwarePythonHasher: - """Return the ``SemanticAwarePythonHasher`` to use for this object. + def _get_hasher(self) -> SemanticHasherProtocol: + """Return the ``SemanticHasherProtocol`` to use for this object. Resolution order: 1. The instance-level ``_semantic_hasher`` attribute (set at @@ -229,7 +229,7 @@ def _get_hasher(self) -> SemanticAwarePythonHasher: type converter, etc.) that belong to the same context. Returns: - SemanticAwarePythonHasher: The hasher to use. + SemanticHasherProtocol: The hasher to use. """ if self._semantic_hasher is not None: return self._semantic_hasher diff --git a/src/orcapod/hashing/semantic_hashing/semantic_hasher.py b/src/orcapod/hashing/semantic_hashing/semantic_hasher.py index ad895fdb..2235037c 100644 --- a/src/orcapod/hashing/semantic_hashing/semantic_hasher.py +++ b/src/orcapod/hashing/semantic_hashing/semantic_hasher.py @@ -71,7 +71,6 @@ from collections.abc import Callable, Mapping from typing import Any -from orcapod.hashing.semantic_hashing.type_handler_registry import PythonTypeHandlerRegistry from orcapod.protocols import hashing_protocols as hp from orcapod.types import ContentHash @@ -91,8 +90,8 @@ class SemanticAwarePythonHasher: A short string identifying this hasher version/configuration. Embedded in every ContentHash produced. type_handler_registry: - ``PythonTypeHandlerRegistry`` for MRO-aware lookup of - ``PythonTypeHandler`` instances. + ``HandlerRegistryProtocol`` for MRO-aware lookup of + ``PythonTypeHandlerProtocol`` instances. If None, the default registry is used. strict: When True (default) raises TypeError for unhandled types. @@ -102,7 +101,7 @@ class SemanticAwarePythonHasher: def __init__( self, hasher_id: str, - type_handler_registry: PythonTypeHandlerRegistry | None = None, + type_handler_registry: "hp.HandlerRegistryProtocol | None" = None, strict: bool = True, ) -> None: self._hasher_id = hasher_id @@ -127,8 +126,8 @@ def strict(self) -> bool: return self._strict @property - def type_handler_registry(self) -> PythonTypeHandlerRegistry: - """Return the ``PythonTypeHandlerRegistry`` used by this hasher.""" + def type_handler_registry(self) -> "hp.HandlerRegistryProtocol": + """Return the ``HandlerRegistryProtocol`` used by this hasher.""" return self._registry def hash_object( @@ -366,7 +365,7 @@ def _hash_to_content_hash(self, obj: Any) -> ContentHash: except (TypeError, ValueError) as exc: raise TypeError( f"SemanticAwarePythonHasher: failed to JSON-serialise object of type " - f"{type(obj).__name__!r}. Ensure all PythonTypeHandler " + f"{type(obj).__name__!r}. Ensure all PythonTypeHandlerProtocol " "implementations and identity_structure() return JSON-serialisable " "primitives or structures." ) from exc @@ -389,15 +388,16 @@ def _handle_unknown(self, obj: Any) -> str: if self._strict: raise TypeError( - f"SemanticAwarePythonHasher (strict): no PythonTypeHandler " - f"registered for type '{qualified}' and it does not implement " - "ContentIdentifiableProtocol. Register a PythonTypeHandler " - "via the PythonTypeHandlerRegistry or implement " - "identity_structure() on the class." + f"SemanticAwarePythonHasher (strict): no implementation of " + f"PythonTypeHandlerProtocol registered for type '{qualified}' and it " + "does not implement ContentIdentifiableProtocol. Register an " + "implementation of PythonTypeHandlerProtocol via the " + "HandlerRegistryProtocol or implement identity_structure() on the class." ) logger.warning( - "SemanticAwarePythonHasher (non-strict): no PythonTypeHandler registered for type '%s'. " + "SemanticAwarePythonHasher (non-strict): no implementation of " + "PythonTypeHandlerProtocol registered for type '%s'. " "Falling back to best-effort string representation.", qualified, ) diff --git a/src/orcapod/hashing/semantic_hashing/type_handler_registry.py b/src/orcapod/hashing/semantic_hashing/type_handler_registry.py index 84614dbd..6389b501 100644 --- a/src/orcapod/hashing/semantic_hashing/type_handler_registry.py +++ b/src/orcapod/hashing/semantic_hashing/type_handler_registry.py @@ -1,7 +1,7 @@ """ -PythonTypeHandlerRegistry — MRO-aware registry for PythonTypeHandler instances. +PythonTypeHandlerRegistry — MRO-aware registry for PythonTypeHandlerProtocol instances. -``PythonTypeHandler`` is the protocol for type-specific handlers; this registry +``PythonTypeHandlerProtocol`` is the protocol for type-specific handlers; this registry provides MRO-aware lookup so subclasses inherit their parent's handler. """ @@ -14,14 +14,14 @@ if TYPE_CHECKING: from orcapod.protocols.hashing_protocols import ( ArrowHasherProtocol, - PythonTypeHandler, + PythonTypeHandlerProtocol, ) logger = logging.getLogger(__name__) class PythonTypeHandlerRegistry: - """Registry mapping Python types to PythonTypeHandler instances. + """Registry mapping Python types to PythonTypeHandlerProtocol instances. Lookup is MRO-aware: when no hasher is registered for the exact type of an object, the registry walks the object's MRO (most-derived first) until @@ -34,20 +34,20 @@ class PythonTypeHandlerRegistry: """ def __init__( - self, handlers: list[tuple[type, "PythonTypeHandler"]] | None = None + self, handlers: list[tuple[type, "PythonTypeHandlerProtocol"]] | None = None ) -> None: """ Args: handlers: Optional list of ``(target_type, hasher)`` pairs to register at construction time. """ - self._handlers: dict[type, "PythonTypeHandler"] = {} + self._handlers: dict[type, "PythonTypeHandlerProtocol"] = {} self._lock = threading.RLock() if handlers: for target_type, handler in handlers: self.register(target_type, handler) - def register(self, target_type: type, handler: "PythonTypeHandler") -> None: + def register(self, target_type: type, handler: "PythonTypeHandlerProtocol") -> None: """Register a hasher for a specific Python type. If a hasher is already registered for *target_type*, it is silently @@ -55,7 +55,7 @@ def register(self, target_type: type, handler: "PythonTypeHandler") -> None: Args: target_type: The Python type (or class) for which the hasher should be used. - handler: A ``PythonTypeHandler`` instance. + handler: A ``PythonTypeHandlerProtocol`` instance. Raises: TypeError: If ``target_type`` is not a ``type``. @@ -90,14 +90,14 @@ def unregister(self, target_type: type) -> bool: return True return False - def get_handler(self, obj: Any) -> "PythonTypeHandler | None": + def get_handler(self, obj: Any) -> "PythonTypeHandlerProtocol | None": """Look up the handler for *obj* using MRO-aware resolution. Args: obj: The object for which a handler is needed. Returns: - The registered ``PythonTypeHandler``, or None. + The registered ``PythonTypeHandlerProtocol``, or None. """ obj_type = type(obj) with self._lock: @@ -117,14 +117,14 @@ def get_handler(self, obj: Any) -> "PythonTypeHandler | None": def get_handler_for_type( self, target_type: type - ) -> "PythonTypeHandler | None": + ) -> "PythonTypeHandlerProtocol | None": """Look up the handler for a *type object* (rather than an instance). Args: target_type: The type to look up. Returns: - The registered ``PythonTypeHandler``, or None. + The registered ``PythonTypeHandlerProtocol``, or None. """ with self._lock: handler = self._handlers.get(target_type) diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index 119f8b3f..4599847f 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -47,10 +47,10 @@ def content_hash(self, hasher: "SemanticHasherProtocol | None" = None) -> Conten ... -class PythonTypeHandler(Protocol): - """Protocol for type-specific semantic hashers used by SemanticAwarePythonHasher. +class PythonTypeHandlerProtocol(Protocol): + """Protocol for type-specific semantic hashers used by ``SemanticAwarePythonHasher``. - A ``PythonTypeHandler`` converts a specific Python type into a + A ``PythonTypeHandlerProtocol`` converts a specific Python type into a representative Python structure that ``SemanticHasherProtocol.hash_object()`` can then hash. Implementations are registered with a ``HandlerRegistryProtocol`` and looked up via MRO-aware resolution. @@ -90,11 +90,11 @@ class HandlerRegistryProtocol(Protocol): and its consumers do not depend on the concrete registry class. """ - def get_handler(self, obj: Any) -> "PythonTypeHandler | None": + def get_handler(self, obj: Any) -> "PythonTypeHandlerProtocol | None": """Look up the handler for *obj* using MRO-aware resolution.""" ... - def get_handler_for_type(self, target_type: type) -> "PythonTypeHandler | None": + def get_handler_for_type(self, target_type: type) -> "PythonTypeHandlerProtocol | None": """Look up the handler for a type object (rather than an instance).""" ... diff --git a/test-objective/unit/test_hashing.py b/test-objective/unit/test_hashing.py index e82083a3..a6928ef3 100644 --- a/test-objective/unit/test_hashing.py +++ b/test-objective/unit/test_hashing.py @@ -58,7 +58,7 @@ def lenient_hasher(registry: PythonTypeHandlerRegistry) -> SemanticAwarePythonHa class _FakeHandler: - """Minimal object satisfying PythonTypeHandler for testing.""" + """Minimal object satisfying PythonTypeHandlerProtocol for testing.""" def __init__(self, return_value: Any = "handled") -> None: self._return_value = return_value @@ -230,7 +230,7 @@ def test_unknown_type_strict_raises(self, hasher: SemanticAwarePythonHasher) -> class Unknown: pass - with pytest.raises(TypeError, match="no PythonTypeHandler registered"): + with pytest.raises(TypeError, match="no implementation of PythonTypeHandlerProtocol registered"): hasher.hash_object(Unknown()) def test_unknown_type_lenient_succeeds( diff --git a/tests/test_hashing/test_semantic_hasher.py b/tests/test_hashing/test_semantic_hasher.py index dc074fcb..3fe4fd38 100644 --- a/tests/test_hashing/test_semantic_hasher.py +++ b/tests/test_hashing/test_semantic_hasher.py @@ -299,7 +299,7 @@ def __init__(self, x: int) -> None: class TestStrictMode: def test_strict_raises_on_unknown_type(self, hasher): - with pytest.raises(TypeError, match="no PythonTypeHandler registered"): + with pytest.raises(TypeError, match="no implementation of PythonTypeHandlerProtocol registered"): hasher.hash_object(Unhandled(1)) def test_non_strict_returns_content_hash(self, lenient_hasher): From 7caa8af2d1a0125460ef73421c4b60984041d937 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Thu, 25 Jun 2026 01:36:00 +0000 Subject: [PATCH 31/33] refactor(hashing): decouple builtin handlers from concrete types - Replace all `handle()` hasher params with SemanticHasherProtocol (was SemanticAwarePythonHasher) across all 11 builtin handler classes - Change register_builtin_python_type_handlers() registry param from PythonTypeHandlerRegistry to HandlerRegistryProtocol - Remove concrete-class imports from TYPE_CHECKING block; import SemanticHasherProtocol and HandlerRegistryProtocol from protocols module - Fix content_identifiable_mixin.py docstring example that incorrectly showed SemanticHasherProtocol being instantiated; replace with SemanticAwarePythonHasher (the concrete class) Co-Authored-By: Claude Sonnet 4.6 --- .../semantic_hashing/builtin_handlers.py | 32 +++++++++---------- .../content_identifiable_mixin.py | 3 +- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py index 096a6cdc..68bbb3ec 100644 --- a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py +++ b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py @@ -29,13 +29,11 @@ from orcapod.types import ContentHash, PathLike, Schema if TYPE_CHECKING: - from orcapod.hashing.semantic_hashing.type_handler_registry import ( - PythonTypeHandlerRegistry, - ) - from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher from orcapod.protocols.hashing_protocols import ( ArrowHasherProtocol, FileContentHasherProtocol, + HandlerRegistryProtocol, + SemanticHasherProtocol, ) logger = logging.getLogger(__name__) @@ -51,7 +49,7 @@ class PathHandler: def __init__(self, file_hasher: "FileContentHasherProtocol") -> None: self.file_hasher = file_hasher - def handle(self, obj: PathLike, hasher: "SemanticAwarePythonHasher") -> ContentHash: + def handle(self, obj: PathLike, hasher: "SemanticHasherProtocol") -> ContentHash: path: Path = Path(obj) if not path.exists(): raise FileNotFoundError( @@ -77,7 +75,7 @@ class UPathHandler: def __init__(self, file_hasher: "FileContentHasherProtocol") -> None: self.file_hasher = file_hasher - def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> ContentHash: if not isinstance(obj, UPath): raise TypeError( f"UPathHandler: expected a UPath, got {type(obj)!r}." @@ -97,14 +95,14 @@ def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: class UUIDHandler: """Hasher for ``uuid.UUID`` objects — returns the raw 16-byte binary representation.""" - def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: return obj.bytes class BytesHandler: """Hasher for bytes and bytearray objects — returns the lowercase hex string.""" - def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: if isinstance(obj, (bytes, bytearray)): return obj.hex() raise TypeError( @@ -123,7 +121,7 @@ class FunctionHandler: def __init__(self, function_info_extractor: Any) -> None: self.function_info_extractor = function_info_extractor - def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: if not (callable(obj) and hasattr(obj, "__code__")): raise TypeError( f"FunctionHandler: expected a callable with __code__, got {type(obj)!r}" @@ -140,7 +138,7 @@ class TypeObjectHandler: Returns a stable string of the form ``"type:."``. """ - def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: if not isinstance(obj, type): raise TypeError( f"TypeObjectHandler: expected a type/class, got {type(obj)!r}" @@ -153,7 +151,7 @@ def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: class SpecialFormHandler: """Hasher for ``typing._SpecialForm`` objects such as ``typing.Union``.""" - def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: name = getattr(obj, "_name", None) or repr(obj) return f"special_form:typing.{name}" @@ -161,7 +159,7 @@ def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: class GenericAliasHandler: """Hasher for generic alias type annotations (``dict[int, str]``, ``Optional[X]``, etc.).""" - def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: import typing origin = getattr(obj, "__origin__", None) @@ -181,7 +179,7 @@ def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: class UnionTypeHandler: """Hasher for ``types.UnionType`` objects (Python 3.10+ ``X | Y`` syntax).""" - def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: args = getattr(obj, "__args__", None) or () hashed_args = sorted(hasher.hash_object(arg).to_string() for arg in args) return {"__type__": "union", "args": hashed_args} @@ -206,7 +204,7 @@ def _get_arrow_hasher(self) -> "ArrowHasherProtocol": from orcapod.contexts import get_default_context return get_default_context().arrow_hasher # type: ignore[return-value] - def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: + def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> ContentHash: import pyarrow as _pa if isinstance(obj, _pa.RecordBatch): @@ -221,7 +219,7 @@ def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> ContentHash: class SchemaHandler: """Hasher for ``Schema`` objects.""" - def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: + def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: if not isinstance(obj, Schema): raise TypeError( f"SchemaHandler: expected a Schema, got {type(obj)!r}" @@ -230,7 +228,7 @@ def handle(self, obj: Any, hasher: "SemanticAwarePythonHasher") -> Any: def register_builtin_python_type_handlers( - registry: "PythonTypeHandlerRegistry", + registry: "HandlerRegistryProtocol", file_hasher: Any = None, function_info_extractor: Any = None, arrow_hasher: "ArrowHasherProtocol | None" = None, @@ -244,7 +242,7 @@ def register_builtin_python_type_handlers( hash time, breaking the construction-time circular dependency. Args: - registry: The ``PythonTypeHandlerRegistry`` to populate. + registry: The ``HandlerRegistryProtocol`` instance to populate. file_hasher: Optional ``FileContentHasherProtocol`` for path hashing. Defaults to ``BasicFileHasher(sha256)``. function_info_extractor: Optional ``FunctionInfoExtractorProtocol``. diff --git a/src/orcapod/hashing/semantic_hashing/content_identifiable_mixin.py b/src/orcapod/hashing/semantic_hashing/content_identifiable_mixin.py index 703cba8d..4543ff01 100644 --- a/src/orcapod/hashing/semantic_hashing/content_identifiable_mixin.py +++ b/src/orcapod/hashing/semantic_hashing/content_identifiable_mixin.py @@ -32,7 +32,8 @@ def identity_structure(self): With an injected hasher (e.g. in tests):: - hasher = SemanticHasherProtocol(hasher_id="test", strict=True) + from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher + hasher = SemanticAwarePythonHasher(hasher_id="test", strict=True) record = MyRecord("foo", 42) record._semantic_hasher = hasher print(record.content_hash()) From 5c57b719f18ab80e4828cd75351a439b20daee0e Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Thu, 25 Jun 2026 02:57:22 +0000 Subject: [PATCH 32/33] fix(hashing): complete HandlerRegistryProtocol and fix test docstring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add register() and __len__() to HandlerRegistryProtocol so the protocol matches every method called on registry inside register_builtin_python_type_handlers(); previously HandlerRegistryProtocol only declared the lookup side of the interface (get_handler, get_handler_for_type, has_handler), leaving register() and len() untyped - Fix test_uuid_handler.py module docstring: s/hash() method behaviour/ handle() dispatch via SemanticAwarePythonHasher/ — UUIDHandler implements handle(), not hash(), and the tests exercise hash_object() dispatch Co-Authored-By: Claude Sonnet 4.6 --- src/orcapod/protocols/hashing_protocols.py | 8 ++++++++ tests/test_hashing/test_uuid_handler.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index 4599847f..a5e066d4 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -90,6 +90,10 @@ class HandlerRegistryProtocol(Protocol): and its consumers do not depend on the concrete registry class. """ + def register(self, target_type: type, handler: "PythonTypeHandlerProtocol") -> None: + """Register a handler for a specific Python type.""" + ... + def get_handler(self, obj: Any) -> "PythonTypeHandlerProtocol | None": """Look up the handler for *obj* using MRO-aware resolution.""" ... @@ -102,6 +106,10 @@ def has_handler(self, target_type: type) -> bool: """Return True if a handler is registered for *target_type* or any MRO ancestor.""" ... + def __len__(self) -> int: + """Return the number of directly-registered types.""" + ... + class SemanticHasherProtocol(Protocol): """Protocol for the semantic content-based hasher.""" diff --git a/tests/test_hashing/test_uuid_handler.py b/tests/test_hashing/test_uuid_handler.py index b9e57cd9..a4692510 100644 --- a/tests/test_hashing/test_uuid_handler.py +++ b/tests/test_hashing/test_uuid_handler.py @@ -1,4 +1,4 @@ -"""Tests for UUIDHandler hash() method behaviour. +"""Tests for UUIDHandler handle() dispatch via SemanticAwarePythonHasher. Verifies that UUIDHandler produces a ContentHash based on the 16-byte binary representation of a UUID, consistent with OrcaPod's canonical From b63ff2df21d9219560bf8d7c51f7e70ebc513c55 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Thu, 25 Jun 2026 03:21:20 +0000 Subject: [PATCH 33/33] docs(hashing): align docstrings with protocol parameter/return types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update all docstrings and type annotations to consistently use protocol types instead of concrete implementation types: - versioned_hashers.py: fix summary ("SemanticAwarePythonHasher" → "SemanticHasherProtocol") and type_handler_registry param description ("PythonTypeHandlerRegistry" → "HandlerRegistryProtocol") to match the hp.HandlerRegistryProtocol annotation already on the parameter - arrow_hashers.py: change semantic_hasher param annotation and docstring from SemanticAwarePythonHasher to SemanticHasherProtocol; update TYPE_CHECKING import accordingly - visitors.py: same for python_hasher param in SemanticHashingVisitor - defaults.py: update "owned by SemanticAwarePythonHasher" to "owned by SemanticHasherProtocol" in get_default_python_type_handler_registry Co-Authored-By: Claude Sonnet 4.6 --- src/orcapod/hashing/arrow_hashers.py | 6 +++--- src/orcapod/hashing/defaults.py | 2 +- src/orcapod/hashing/versioned_hashers.py | 4 ++-- src/orcapod/hashing/visitors.py | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index f568cac1..d5ce6a7c 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -11,7 +11,7 @@ if TYPE_CHECKING: from orcapod.semantic_types.universal_converter import UniversalTypeConverter - from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher + from orcapod.protocols.hashing_protocols import SemanticHasherProtocol class StarfixArrowHasher: @@ -35,7 +35,7 @@ class StarfixArrowHasher: ``UniversalTypeConverter`` used to resolve extension types to Python types and convert storage values back to Python objects. semantic_hasher: - ``SemanticAwarePythonHasher`` used to hash Python objects extracted + ``SemanticHasherProtocol`` used to hash Python objects extracted from extension-typed columns. hasher_id: String identifier embedded in every ``ContentHash`` produced by this @@ -45,7 +45,7 @@ class StarfixArrowHasher: def __init__( self, type_converter: "UniversalTypeConverter", - semantic_hasher: "SemanticAwarePythonHasher", + semantic_hasher: "SemanticHasherProtocol", hasher_id: str, ) -> None: self._type_converter = type_converter diff --git a/src/orcapod/hashing/defaults.py b/src/orcapod/hashing/defaults.py index fb95675b..26a6ac44 100644 --- a/src/orcapod/hashing/defaults.py +++ b/src/orcapod/hashing/defaults.py @@ -19,7 +19,7 @@ def get_default_python_type_handler_registry() -> PythonTypeHandlerRegistry: Return the ``PythonTypeHandlerRegistry`` from the default data context's semantic hasher. - The registry is owned by the active ``SemanticAwarePythonHasher``, which is itself + The registry is owned by the active ``SemanticHasherProtocol``, which is itself versioned inside the active ``DataContext``. Returns: diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index 33b5a7da..c968bbca 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -50,7 +50,7 @@ def get_versioned_semantic_hasher( strict: bool = True, type_handler_registry: "hp.HandlerRegistryProtocol | None" = None, ) -> hp.SemanticHasherProtocol: - """Return a SemanticAwarePythonHasher configured for the current version. + """Return a SemanticHasherProtocol configured for the current version. Parameters ---------- @@ -60,7 +60,7 @@ def get_versioned_semantic_hasher( When True raises TypeError for unhandled types. When False falls back to a best-effort string representation. type_handler_registry: - Optional ``PythonTypeHandlerRegistry`` to inject. When None the + Optional ``HandlerRegistryProtocol`` to inject. When None the global default registry is used. """ from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher diff --git a/src/orcapod/hashing/visitors.py b/src/orcapod/hashing/visitors.py index a84be1c9..ec0382ac 100644 --- a/src/orcapod/hashing/visitors.py +++ b/src/orcapod/hashing/visitors.py @@ -13,7 +13,7 @@ if TYPE_CHECKING: import pyarrow as pa from orcapod.semantic_types.universal_converter import UniversalTypeConverter - from orcapod.hashing.semantic_hashing.semantic_hasher import SemanticAwarePythonHasher + from orcapod.protocols.hashing_protocols import SemanticHasherProtocol else: pa = LazyModule("pyarrow") @@ -180,14 +180,14 @@ class SemanticHashingVisitor(ArrowTypeDataVisitor): Args: type_converter: The active ``UniversalTypeConverter`` for resolving extension type → Python type and storage → Python conversion. - python_hasher: The active ``SemanticAwarePythonHasher`` for hashing + python_hasher: The active ``SemanticHasherProtocol`` for hashing Python objects. """ def __init__( self, type_converter: "UniversalTypeConverter", - python_hasher: "SemanticAwarePythonHasher", + python_hasher: "SemanticHasherProtocol", ) -> None: self._type_converter = type_converter self._python_hasher = python_hasher