Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
3be642a
docs(plt-1660): add design spec for hard cut to extension type hashing
kurodo3[bot] Jun 24, 2026
1f16a93
docs(plt-1660): update spec with protocol tightening and full renames
kurodo3[bot] Jun 24, 2026
f432895
docs(plt-1660): update binary encoding format to use "::" separator a…
kurodo3[bot] Jun 24, 2026
450dde8
refactor(hashing_protocols): rename TypeHandlerProtocol → PythonTypeS…
kurodo3[bot] Jun 24, 2026
55eea46
refactor(hashing_protocols): add SemanticAwarePythonHasher to TYPE_CH…
kurodo3[bot] Jun 24, 2026
5843544
refactor(type_handler_registry): rename to PythonTypeSemanticHasherRe…
kurodo3[bot] Jun 24, 2026
1f543f4
refactor(builtin_handlers): rename handler classes, tighten hash() → …
kurodo3[bot] Jun 24, 2026
895f885
refactor(semantic_hasher): rename BaseSemanticHasher → SemanticAwareP…
kurodo3[bot] Jun 24, 2026
b965fc5
refactor: update BaseSemanticHasher → SemanticAwarePythonHasher refs …
kurodo3[bot] Jun 24, 2026
0b8ae99
refactor(hashing): update __init__.py exports and versioned_hashers f…
kurodo3[bot] Jun 24, 2026
21cedfc
refactor(contexts): update v0.1.json context spec to use renamed clas…
kurodo3[bot] Jun 24, 2026
305735c
refactor(tests): update hashing tests for renamed classes and methods
kurodo3[bot] Jun 24, 2026
86870eb
test(semantic_hasher): rename _DummyHandler → _DummySemanticHasher, f…
kurodo3[bot] Jun 24, 2026
d170232
feat(visitors): add visit_extension dispatch; rewrite SemanticHashing…
kurodo3[bot] Jun 24, 2026
6bab2f4
fix(visitors): use real file in dispatch test, remove deferred typing…
kurodo3[bot] Jun 24, 2026
aaa3070
refactor(arrow_hashers): delete SemanticArrowHasher, finalize Starfix…
kurodo3[bot] Jun 24, 2026
ba3d977
test(starfix_arrow_hasher): update _make_hasher() for new constructor
kurodo3[bot] Jun 24, 2026
4cf7001
feat(v0.1): wire extension type hashing into default context; remove …
kurodo3[bot] Jun 24, 2026
f72832a
feat(PLT-1660): hard cut — delete SemanticTypeRegistry and old struct…
kurodo3[bot] Jun 24, 2026
8038507
fix(PLT-1660): fix broken get_default_arrow_hasher, add passthrough t…
kurodo3[bot] Jun 24, 2026
d34c504
docs(PLT-1660): add implementation plan for hard-cut extension type h…
kurodo3[bot] Jun 24, 2026
14478b3
fix(test-objective): update test_hashing.py for renamed hashing classes
kurodo3[bot] Jun 24, 2026
d29079c
fix(PLT-1660): address Copilot review — utf-8 encoding, return type a…
kurodo3[bot] Jun 24, 2026
28a0987
refactor(hashing): revert PythonTypeSemanticHasherProtocol.hash() to …
kurodo3[bot] Jun 24, 2026
a79641f
refactor(hashing): rename PythonTypeSemanticHasherProtocol → PythonTy…
kurodo3[bot] Jun 24, 2026
764a1bf
refactor(hashing): rename *SemanticHasher → *Handler, PythonTypeSeman…
kurodo3[bot] Jun 24, 2026
5c12aa0
docs(test_hashing): update stale BaseSemanticHasher → SemanticAwarePy…
kurodo3[bot] Jun 24, 2026
596333b
fix(context): rename function_info_extractor → function_semantic_hash…
kurodo3[bot] Jun 25, 2026
d71bf19
refactor(hashing): rename registry methods, add HandlerRegistryProtoc…
kurodo3[bot] Jun 25, 2026
e8129d7
refactor(hashing): enforce Protocol naming convention and decouple co…
kurodo3[bot] Jun 25, 2026
7caa8af
refactor(hashing): decouple builtin handlers from concrete types
kurodo3[bot] Jun 25, 2026
5c57b71
fix(hashing): complete HandlerRegistryProtocol and fix test docstring
kurodo3[bot] Jun 25, 2026
b63ff2d
docs(hashing): align docstrings with protocol parameter/return types
kurodo3[bot] Jun 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/orcapod/contexts/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class DataContext:
registration. This is the single public API for all type operations.
arrow_hasher: Arrow table hasher for this context
semantic_hasher: General semantic hasher for this context. The
``TypeHandlerRegistry`` used for hashing is accessible via
``PythonTypeHandlerRegistry`` used for hashing is accessible via
``semantic_hasher.type_handler_registry``.
"""

Expand Down
59 changes: 18 additions & 41 deletions src/orcapod/contexts/data/schemas/context_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,6 @@
"Enhanced version with timestamp support and improved hashing"
]
},
"semantic_registry": {
"$ref": "#/$defs/objectspec",
"description": "ObjectSpec for the semantic registry"
},
"type_converter": {
"$ref": "#/$defs/objectspec",
"description": "ObjectSpec for the python-arrow type converter"
Expand All @@ -57,17 +53,17 @@
"$ref": "#/$defs/objectspec",
"description": "ObjectSpec for the semantic hasher component"
},
"type_handler_registry": {
"python_type_handler_registry": {
"$ref": "#/$defs/objectspec",
"description": "ObjectSpec for the TypeHandlerRegistry used by the semantic hasher"
"description": "ObjectSpec for the PythonTypeHandlerRegistry used by the semantic hasher"
},
"file_hasher": {
"$ref": "#/$defs/objectspec",
"description": "ObjectSpec for the file content hasher (used by PathContentHandler)"
"description": "ObjectSpec for the file content hasher (used by PathHandler)"
},
"function_info_extractor": {
"function_semantic_hasher": {
"$ref": "#/$defs/objectspec",
"description": "ObjectSpec for the function info extractor (used by FunctionHandler)"
"description": "ObjectSpec for the function semantic hasher (used by FunctionHandler)"
},
"metadata": {
"type": "object",
Expand Down Expand Up @@ -167,51 +163,32 @@
{
"context_key": "std:v0.1:default",
"version": "v0.1",
"description": "Initial stable release with basic Path semantic type support",
"semantic_type_registry": {
"_class": "orcapod.types.semantic_types.SemanticTypeRegistry",
"_config": {
"converters": [
{
"_class": "orcapod.types.semantic_types.PythonPathStructConverter",
"_config": {}
}
]
}
"description": "Initial stable release with extension type hashing support",
"type_converter": {
"_class": "orcapod.semantic_types.universal_converter.UniversalTypeConverter",
"_config": {}
},
"arrow_hasher": {
"_class": "orcapod.hashing.arrow_hashers.SemanticArrowHasher",
"_class": "orcapod.hashing.arrow_hashers.StarfixArrowHasher",
"_config": {
"hasher_id": "arrow_v0.1",
"hash_algorithm": "sha256",
"serialization_method": "logical",
"semantic_type_hashers": {
"path": {
"_class": "orcapod.hashing.semantic_type_hashers.PathHasher",
"_config": {
"file_hasher": {
"_class": "orcapod.hashing.file_hashers.BasicFileHasher",
"_config": {
"algorithm": "sha256"
}
}
}
}
}
"type_converter": {"_ref": "type_converter"},
"semantic_hasher": {"_ref": "semantic_hasher"}
}
},
"semantic_hasher": {
"_class": "orcapod.hashing.semantic_hashing.semantic_hasher.BaseSemanticHasher",
"_class": "orcapod.hashing.semantic_hashing.semantic_hasher.SemanticAwarePythonHasher",
"_config": {
"hasher_id": "semantic_v0.1"
"hasher_id": "semantic_v0.1",
"type_handler_registry": {"_ref": "python_type_handler_registry"}
}
},
"metadata": {
"created_date": "2025-08-01",
"created_date": "2026-06-24",
"author": "OrcaPod Team",
"changelog": [
"Initial release with semantic type registry",
"Basic Arrow and object hashing capabilities"
"Initial release with extension type hashing support",
"StarfixArrowHasher for cross-language-compatible Arrow hashing"
]
}
}
Expand Down
83 changes: 32 additions & 51 deletions src/orcapod/contexts/data/v0.1.json
Original file line number Diff line number Diff line change
@@ -1,41 +1,13 @@
{
"context_key": "std:v0.1:default",
"version": "v0.1",
"description": "Initial stable release with basic Path semantic type support",
"description": "Initial stable release with extension type hashing support",
"file_hasher": {
"_class": "orcapod.hashing.file_hashers.BasicFileHasher",
"_config": {
"algorithm": "sha256"
}
},
"semantic_registry": {
"_class": "orcapod.semantic_types.semantic_registry.SemanticTypeRegistry",
"_config": {
"converters": {
"upath": {
"_class": "orcapod.semantic_types.semantic_struct_converters.UPathStructConverter",
"_config": {
"file_hasher": {"_ref": "file_hasher"}
}
},
"path": {
"_class": "orcapod.semantic_types.semantic_struct_converters.PythonPathStructConverter",
"_config": {
"file_hasher": {"_ref": "file_hasher"}
}
}
}
}
},
"arrow_hasher": {
"_class": "orcapod.hashing.arrow_hashers.StarfixArrowHasher",
"_config": {
"hasher_id": "arrow_v0.1",
"semantic_registry": {
"_ref": "semantic_registry"
}
}
},
"type_converter": {
"_class": "orcapod.semantic_types.universal_converter.UniversalTypeConverter",
"_config": {
Expand Down Expand Up @@ -78,52 +50,61 @@
}
}
},
"function_info_extractor": {
"function_semantic_hasher": {
"_class": "orcapod.hashing.semantic_hashing.function_info_extractors.FunctionSignatureExtractor",
"_config": {
"include_module": true,
"include_defaults": true
}
},
"type_handler_registry": {
"_class": "orcapod.hashing.semantic_hashing.type_handler_registry.TypeHandlerRegistry",
"python_type_handler_registry": {
"_class": "orcapod.hashing.semantic_hashing.type_handler_registry.PythonTypeHandlerRegistry",
"_config": {
"handlers": [
[{"_type": "builtins.bytes"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesHandler", "_config": {}}],
[{"_type": "builtins.bytearray"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesHandler", "_config": {}}],
[{"_type": "pathlib.Path"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.PathContentHandler", "_config": {"file_hasher": {"_ref": "file_hasher"}}}],
[{"_type": "upath.core.UPath"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UPathContentHandler", "_config": {"file_hasher": {"_ref": "file_hasher"}}}],
[{"_type": "uuid.UUID"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UUIDHandler", "_config": {}}],
[{"_type": "types.FunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}],
[{"_type": "types.BuiltinFunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}],
[{"_type": "types.MethodType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}],
[{"_type": "builtins.type"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.TypeObjectHandler", "_config": {}}],
[{"_type": "types.GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasHandler", "_config": {}}],
[{"_type": "types.UnionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UnionTypeHandler", "_config": {}}],
[{"_type": "typing._GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasHandler", "_config": {}}],
[{"_type": "typing._SpecialForm"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.SpecialFormHandler", "_config": {}}],
[{"_type": "pyarrow.Table"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableHandler", "_config": {"arrow_hasher": {"_ref": "arrow_hasher"}}}],
[{"_type": "pyarrow.RecordBatch"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableHandler", "_config": {"arrow_hasher": {"_ref": "arrow_hasher"}}}]
[{"_type": "builtins.bytes"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesHandler", "_config": {}}],
[{"_type": "builtins.bytearray"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesHandler", "_config": {}}],

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's add a follow-up issue so that the same handler can be registered to multiple target classes and make use of the MRO-based matching system already used in many other places in the codebase.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Filed PLT-1827 to track this. No code change in this PR.

[{"_type": "pathlib.Path"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.PathHandler", "_config": {"file_hasher": {"_ref": "file_hasher"}}}],
[{"_type": "upath.core.UPath"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UPathHandler", "_config": {"file_hasher": {"_ref": "file_hasher"}}}],
[{"_type": "uuid.UUID"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UUIDHandler", "_config": {}}],
[{"_type": "types.FunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_semantic_hasher"}}}],
[{"_type": "types.BuiltinFunctionType"},{"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_semantic_hasher"}}}],
[{"_type": "types.MethodType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_semantic_hasher"}}}],
[{"_type": "builtins.type"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.TypeObjectHandler", "_config": {}}],
[{"_type": "types.GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasHandler", "_config": {}}],
[{"_type": "types.UnionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UnionTypeHandler", "_config": {}}],
[{"_type": "typing._GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasHandler", "_config": {}}],
[{"_type": "typing._SpecialForm"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.SpecialFormHandler", "_config": {}}],
[{"_type": "pyarrow.Table"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableHandler", "_config": {}}],
[{"_type": "pyarrow.RecordBatch"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableHandler", "_config": {}}]
]
}
},
"semantic_hasher": {
"_class": "orcapod.hashing.semantic_hashing.semantic_hasher.BaseSemanticHasher",
"_class": "orcapod.hashing.semantic_hashing.semantic_hasher.SemanticAwarePythonHasher",
"_config": {
"hasher_id": "semantic_v0.1",
"type_handler_registry": {
"_ref": "type_handler_registry"
"_ref": "python_type_handler_registry"
}
}
},
"arrow_hasher": {
"_class": "orcapod.hashing.arrow_hashers.StarfixArrowHasher",
"_config": {
"hasher_id": "arrow_v0.1",
"type_converter": {"_ref": "type_converter"},

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The fact that arrow_hasher takes type_converter and semantic_hasher as constructor arguments makes the relationship between arrow_hasher and semantic_hasher circular in the default context. This strongly suggests we should break the cycle by making one of them instantiable WITHOUT the other in its constructor. Instead, it should "optionally" accept the other (e.g. semantic_hasher) when a method is invoked on the arrow hasher.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Filed PLT-1826 to track the decoupling. The likely fix is to remove type_converter and semantic_hasher from StarfixArrowHasher.__init__ and accept them only at method-invocation time (e.g. hash_table(table, *, type_converter=None, semantic_hasher=None)), resolving lazily from get_default_context() when omitted. No code change in this PR.

"semantic_hasher": {"_ref": "semantic_hasher"}
}
},
"metadata": {
"created_date": "2025-08-01",
"created_date": "2026-06-24",
"author": "OrcaPod Core Team",
"changelog": [
"Initial release with Path semantic type support",
"Basic SHA-256 hashing for files and objects",
"Arrow logical serialization method",
"Introduced arrow_v0.1 StarfixArrowHasher using starfix ArrowDigester for cross-language-compatible Arrow hashing"
"Introduced arrow_v0.1 StarfixArrowHasher using starfix ArrowDigester for cross-language-compatible Arrow hashing",
"Hard cut: replaced shape-based SemanticTypeRegistry with extension-type hashing; renamed all hashing classes to cleaner names"
]
}
}
8 changes: 4 additions & 4 deletions src/orcapod/core/function_pod.py
Original file line number Diff line number Diff line change
Expand Up @@ -656,7 +656,7 @@ def as_table(
return output_table


class CallableWithPod(Protocol):
class CallableWithPodProtocol(Protocol):
@property
def pod(self) -> _FunctionPodBase:
"""Return the associated function pod."""
Expand All @@ -676,7 +676,7 @@ def function_pod(
pod_cache_database: ArrowDatabaseProtocol | None = None,
executor: DataFunctionExecutorProtocol | None = None,
**kwargs,
) -> Callable[..., CallableWithPod]:
) -> Callable[..., CallableWithPodProtocol]:
"""Decorator that attaches a ``FunctionPod`` as a ``pod`` attribute.

Args:
Expand All @@ -696,7 +696,7 @@ def function_pod(
A decorator that adds a ``pod`` attribute to the wrapped function.
"""

def decorator(func: Callable) -> CallableWithPod:
def decorator(func: Callable) -> CallableWithPodProtocol:
if func.__name__ == "<lambda>":
raise ValueError("Lambda functions cannot be used with function_pod")

Expand Down Expand Up @@ -736,7 +736,7 @@ def wrapper(*args, **kwargs):
return func(*args, **kwargs)

setattr(wrapper, "pod", pod)
return cast(CallableWithPod, wrapper)
return cast(CallableWithPodProtocol, wrapper)

return decorator

Expand Down
4 changes: 2 additions & 2 deletions src/orcapod/extension_types/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,8 @@ class LogicalTypeRegistry:
An optional ``logical_types`` list can be passed at construction time to
pre-register one or more ``LogicalTypeProtocol`` instances immediately, following
the same pattern as ``SemanticTypeRegistry``'s ``converters`` constructor
argument.
the same pattern as the ``logical_types`` constructor argument used by
other registries in this package.
An optional ``factories`` list can also be passed to pre-register
``LogicalTypeFactoryProtocol`` instances at construction time. Each entry is a
Expand Down
Loading
Loading