From 9a9a2e1ab080f5aefe214cb7f37ec489e06ac936 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Thu, 18 Jun 2026 15:45:50 +0200 Subject: [PATCH 1/3] Bump version to 1.5.1 in pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b98f3f5..1d3cf12 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "welearn-database" -version = "1.5.0" +version = "1.5.1" description = "All stuff related to relationnal database from the WeLearn project" authors = [ {name = "Théo",email = "theo.nardin@cri-paris.org"} From 60fd989022c95bc7510521927c5bbb05a92a2db2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Thu, 18 Jun 2026 16:11:12 +0200 Subject: [PATCH 2/3] Add DetailsDict type decorator for serializing dataclass instances to dict --- tests/test_document_related.py | 42 +++++++++++------- welearn_database/data/details_dict.py | 44 +++++++++++++++++++ .../data/models/document_related.py | 3 +- 3 files changed, 72 insertions(+), 17 deletions(-) create mode 100644 welearn_database/data/details_dict.py diff --git a/tests/test_document_related.py b/tests/test_document_related.py index cf49cef..571c00f 100644 --- a/tests/test_document_related.py +++ b/tests/test_document_related.py @@ -1,4 +1,5 @@ import uuid +from dataclasses import dataclass from unittest import TestCase from zlib import adler32 @@ -20,6 +21,12 @@ from welearn_database.exceptions import ContentIsTooShort, InvalidDOI, InvalidURLScheme +@dataclass +class AuthorDetails: + name: str + misc: str + + class TestWeLearnDocument(TestCase): def test_validate_url(self): test_doc = WeLearnDocument( @@ -29,7 +36,7 @@ def test_validate_url(self): description="A short description of the test document.", lang="en", corpus="Test Corpus", - details={"author": "Test Author"}, + details={"author": AuthorDetails(name="Test Author", misc="")}, ) self.assertEqual(test_doc.url, "https://example.com/test-document") @@ -43,7 +50,7 @@ def test_validate_wrong_scheme_url(self): description="A short description of the test document.", lang="en", corpus="Test Corpus", - details={"author": "Test Author"}, + details={"author": AuthorDetails(name="Test Author", misc="")}, ) def test_validate_wrong_url(self): @@ -55,7 +62,7 @@ def test_validate_wrong_url(self): description="A short description of the test document.", lang="en", corpus="Test Corpus", - details={"author": "Test Author"}, + details={"author": AuthorDetails(name="Test Author", misc="")}, ) def test_validate_full_content(self): @@ -66,7 +73,7 @@ def test_validate_full_content(self): description="A short description of the test document.", lang="en", corpus="Test Corpus", - details={"author": "Test Author"}, + details={"author": AuthorDetails(name="Test Author", misc="")}, ) self.assertEqual( @@ -83,7 +90,7 @@ def test_validate_too_short_full_content(self): description="A short description of the test document.", lang="en", corpus="Test Corpus", - details={"author": "Test Author"}, + details={"author": AuthorDetails(name="Test Author", misc="")}, ) def test_full_content(self): @@ -94,7 +101,7 @@ def test_full_content(self): description="A short description of the test document.", lang="en", corpus="Test Corpus", - details={"author": "Test Author"}, + details={"author": AuthorDetails(name="Test Author", misc="")}, ) self.assertEqual( @@ -110,7 +117,10 @@ def test_doi(self): description="A short description of the test document.", lang="en", corpus="Test Corpus", - details={"author": "Test Author", "doi": "10.1000/xyz123"}, + details={ + "author": AuthorDetails(name="Test Author", misc=""), + "doi": "10.1000/xyz123", + }, doi="10.1000/xyz123", ) @@ -126,7 +136,7 @@ def test_invalid_doi(self): lang="en", corpus="Test Corpus", details={ - "author": "Test Author", + "author": AuthorDetails(name="Test Author", misc=""), "doi": "11.1590/s0100-879x2002000500007", }, doi="11.1590/s0100-879x2002000500007", @@ -142,7 +152,7 @@ def test_unclean_doi(self): lang="en", corpus="Test Corpus", details={ - "author": "Test Author", + "author": AuthorDetails(name="Test Author", misc=""), "doi": "https://doi.org/10.1000/xyz123", }, doi="https://doi.org/10.1000/xyz123", @@ -156,7 +166,7 @@ def test_description(self): description="

A short description   of the test document.

", lang="en", corpus="Test Corpus", - details={"author": "Test Author"}, + details={"author": AuthorDetails(name="Test Author", misc="")}, ) self.assertEqual( @@ -202,7 +212,7 @@ def test_external_id(self): full_content="This is a test document, used for unit testing, please ignore. Thank you!", description="A short description of the test document.", lang="en", - details={"author": "Test Author"}, + details={"author": AuthorDetails(name="Test Author", misc="")}, ) test_session.add(test_doc) @@ -232,7 +242,7 @@ def test_trace(self): description="A short description of the test document.", lang="en", corpus="Test Corpus", - details={"author": "Test Author"}, + details={"author": AuthorDetails(name="Test Author", misc="")}, ) self.assertEqual(test_doc.trace, expected_trace) @@ -275,7 +285,7 @@ def test_trace_in_db(self): description="A short description of the test document.", lang="en", corpus_id=test_corpus.id, - details={"author": "Test Author"}, + details={"author": AuthorDetails(name="Test Author", misc="")}, ) test_session.add(test_doc) test_session.commit() @@ -322,7 +332,7 @@ def test_none_trace_in_db(self): description="A short description of the test document.", lang="en", corpus_id=test_corpus.id, - details={"author": "Test Author"}, + details={"author": AuthorDetails(name="Test Author", misc="")}, ) test_session.add(test_doc) test_session.commit() @@ -372,7 +382,7 @@ def test_view_qty_document(self): description="A short description of the test document.", lang="en", corpus_id=test_corpus.id, - details={"author": "Test Author"}, + details={"author": AuthorDetails(name="Test Author", misc="")}, ) test_session.add(test_doc) test_process_state = ProcessState( @@ -441,7 +451,7 @@ def test_error_data_quality(self): description="A short description of the test document.", lang="en", corpus_id=test_corpus.id, - details={"author": "Test Author"}, + details={"author": AuthorDetails(name="Test Author", misc="")}, ) test_session.add(test_doc) test_process_state = ProcessState( diff --git a/welearn_database/data/details_dict.py b/welearn_database/data/details_dict.py new file mode 100644 index 0000000..444ecf1 --- /dev/null +++ b/welearn_database/data/details_dict.py @@ -0,0 +1,44 @@ +from dataclasses import asdict, is_dataclass + +from sqlalchemy import types + + +class DetailsDict(types.TypeDecorator): + """ + Convert all dataclass into dict + """ + + impl = types.JSON + + @staticmethod + def _is_dataclass_instance(obj): + return is_dataclass(obj) and not isinstance(obj, type) + + def _inner_serialize_dataclass(self, value): + match value: + case list(): + return [self._inner_serialize_dataclass(item) for item in value] + case dict(): + return {k: self._inner_serialize_dataclass(v) for k, v in value.items()} + if self._is_dataclass_instance(value): + return asdict(value) + return value + + def process_bind_param(self, value, dialect): + if isinstance(value, dict): + for detail_key, detail_value in value.items(): + match detail_value: + case list(): + value[detail_key] = [ + self._inner_serialize_dataclass(item) + for item in detail_value + ] + case dict(): + for k, v in detail_value.items(): + detail_value[k] = self._inner_serialize_dataclass(v) + value[detail_key] = detail_value + case _: + value[detail_key] = self._inner_serialize_dataclass( + detail_value + ) + return value diff --git a/welearn_database/data/models/document_related.py b/welearn_database/data/models/document_related.py index 659c0f6..6d0e996 100644 --- a/welearn_database/data/models/document_related.py +++ b/welearn_database/data/models/document_related.py @@ -16,6 +16,7 @@ from sqlalchemy.dialects.postgresql import ARRAY, ENUM, TIMESTAMP from sqlalchemy.orm import Mapped, mapped_column, relationship, validates +from welearn_database.data.details_dict import DetailsDict from welearn_database.data.enumeration import ( ContextType, Counter, @@ -83,7 +84,7 @@ class WeLearnDocument(Base): lang: Mapped[str | None] description: Mapped[str | None] full_content: Mapped[str | None] - details: Mapped[dict[str, Any] | None] + details: Mapped[dict[str, Any] | None] = mapped_column(DetailsDict) trace: Mapped[int | None] = mapped_column(types.BIGINT) corpus_id: Mapped[UUID] = mapped_column( types.Uuid, From 119adf91a4d5fc8d5d7a98c85ceb8510dc1373a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= <133012334+lpi-tn@users.noreply.github.com> Date: Thu, 18 Jun 2026 16:23:37 +0200 Subject: [PATCH 3/3] Apply suggestions from code review Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- welearn_database/data/details_dict.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/welearn_database/data/details_dict.py b/welearn_database/data/details_dict.py index 444ecf1..9eb0ebc 100644 --- a/welearn_database/data/details_dict.py +++ b/welearn_database/data/details_dict.py @@ -5,10 +5,11 @@ class DetailsDict(types.TypeDecorator): """ - Convert all dataclass into dict + Convert dataclass instances into dictionaries for JSON storage. """ impl = types.JSON + cache_ok = True @staticmethod def _is_dataclass_instance(obj): @@ -25,20 +26,5 @@ def _inner_serialize_dataclass(self, value): return value def process_bind_param(self, value, dialect): - if isinstance(value, dict): - for detail_key, detail_value in value.items(): - match detail_value: - case list(): - value[detail_key] = [ - self._inner_serialize_dataclass(item) - for item in detail_value - ] - case dict(): - for k, v in detail_value.items(): - detail_value[k] = self._inner_serialize_dataclass(v) - value[detail_key] = detail_value - case _: - value[detail_key] = self._inner_serialize_dataclass( - detail_value - ) - return value + # Serialize recursively without mutating the original object stored on the ORM instance. + return self._inner_serialize_dataclass(value)