CyberCRI · lpi-tn · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "welearn-database"
-version = "1.5.0"
+version = "1.5.1"
 description = "All stuff related to relationnal database from the WeLearn project"
 authors = [
     {name = "Théo",email = "theo.nardin@cri-paris.org"}

diff --git a/tests/test_document_related.py b/tests/test_document_related.py
@@ -1,4 +1,5 @@
 import uuid
+from dataclasses import dataclass
 from unittest import TestCase
 from zlib import adler32
 
@@ -20,6 +21,12 @@
 from welearn_database.exceptions import ContentIsTooShort, InvalidDOI, InvalidURLScheme
 
 
+@dataclass
+class AuthorDetails:  # test helper: a dataclass value placed under details["author"]
+    name: str  # the tests pass "Test Author"
+    misc: str  # the tests always pass ""
+
+
 class TestWeLearnDocument(TestCase):
     def test_validate_url(self):
         test_doc = WeLearnDocument(
@@ -29,7 +36,7 @@ def test_validate_url(self):
             description="A short description of the test document.",
             lang="en",
             corpus="Test Corpus",
-            details={"author": "Test Author"},
+            details={"author": AuthorDetails(name="Test Author", misc="")},
         )
 
         self.assertEqual(test_doc.url, "https://example.com/test-document")
@@ -43,7 +50,7 @@ def test_validate_wrong_scheme_url(self):
                 description="A short description of the test document.",
                 lang="en",
                 corpus="Test Corpus",
-                details={"author": "Test Author"},
+                details={"author": AuthorDetails(name="Test Author", misc="")},
             )
 
     def test_validate_wrong_url(self):
@@ -55,7 +62,7 @@ def test_validate_wrong_url(self):
                 description="A short description of the test document.",
                 lang="en",
                 corpus="Test Corpus",
-                details={"author": "Test Author"},
+                details={"author": AuthorDetails(name="Test Author", misc="")},
             )
 
     def test_validate_full_content(self):
@@ -66,7 +73,7 @@ def test_validate_full_content(self):
             description="A short description of the test document.",
             lang="en",
             corpus="Test Corpus",
-            details={"author": "Test Author"},
+            details={"author": AuthorDetails(name="Test Author", misc="")},
         )
 
         self.assertEqual(
@@ -83,7 +90,7 @@ def test_validate_too_short_full_content(self):
                 description="A short description of the test document.",
                 lang="en",
                 corpus="Test Corpus",
-                details={"author": "Test Author"},
+                details={"author": AuthorDetails(name="Test Author", misc="")},
             )
 
     def test_full_content(self):
@@ -94,7 +101,7 @@ def test_full_content(self):
             description="A short description of the test document.",
             lang="en",
             corpus="Test Corpus",
-            details={"author": "Test Author"},
+            details={"author": AuthorDetails(name="Test Author", misc="")},
         )
 
         self.assertEqual(
@@ -110,7 +117,10 @@ def test_doi(self):
             description="A short description of the test document.",
             lang="en",
             corpus="Test Corpus",
-            details={"author": "Test Author", "doi": "10.1000/xyz123"},
+            details={
+                "author": AuthorDetails(name="Test Author", misc=""),
+                "doi": "10.1000/xyz123",
+            },
             doi="10.1000/xyz123",
         )
 
@@ -126,7 +136,7 @@ def test_invalid_doi(self):
                 lang="en",
                 corpus="Test Corpus",
                 details={
-                    "author": "Test Author",
+                    "author": AuthorDetails(name="Test Author", misc=""),
                     "doi": "11.1590/s0100-879x2002000500007",
                 },
                 doi="11.1590/s0100-879x2002000500007",
@@ -142,7 +152,7 @@ def test_unclean_doi(self):
                 lang="en",
                 corpus="Test Corpus",
                 details={
-                    "author": "Test Author",
+                    "author": AuthorDetails(name="Test Author", misc=""),
                     "doi": "https://doi.org/10.1000/xyz123",
                 },
                 doi="https://doi.org/10.1000/xyz123",
@@ -156,7 +166,7 @@ def test_description(self):
             description="<p>A short description &nbsp of the   test document.</p>",
             lang="en",
             corpus="Test Corpus",
-            details={"author": "Test Author"},
+            details={"author": AuthorDetails(name="Test Author", misc="")},
         )
 
         self.assertEqual(
@@ -202,7 +212,7 @@ def test_external_id(self):
             full_content="This is a test document, used for unit testing, please ignore. Thank you!",
             description="A short description of the test document.",
             lang="en",
-            details={"author": "Test Author"},
+            details={"author": AuthorDetails(name="Test Author", misc="")},
         )
 
         test_session.add(test_doc)
@@ -232,7 +242,7 @@ def test_trace(self):
             description="A short description of the test document.",
             lang="en",
             corpus="Test Corpus",
-            details={"author": "Test Author"},
+            details={"author": AuthorDetails(name="Test Author", misc="")},
         )
 
         self.assertEqual(test_doc.trace, expected_trace)
@@ -275,7 +285,7 @@ def test_trace_in_db(self):
             description="A short description of the test document.",
             lang="en",
             corpus_id=test_corpus.id,
-            details={"author": "Test Author"},
+            details={"author": AuthorDetails(name="Test Author", misc="")},
         )
         test_session.add(test_doc)
         test_session.commit()
@@ -322,7 +332,7 @@ def test_none_trace_in_db(self):
             description="A short description of the test document.",
             lang="en",
             corpus_id=test_corpus.id,
-            details={"author": "Test Author"},
+            details={"author": AuthorDetails(name="Test Author", misc="")},
         )
         test_session.add(test_doc)
         test_session.commit()
@@ -372,7 +382,7 @@ def test_view_qty_document(self):
                 description="A short description of the test document.",
                 lang="en",
                 corpus_id=test_corpus.id,
-                details={"author": "Test Author"},
+                details={"author": AuthorDetails(name="Test Author", misc="")},
             )
             test_session.add(test_doc)
             test_process_state = ProcessState(
@@ -441,7 +451,7 @@ def test_error_data_quality(self):
                 description="A short description of the test document.",
                 lang="en",
                 corpus_id=test_corpus.id,
-                details={"author": "Test Author"},
+                details={"author": AuthorDetails(name="Test Author", misc="")},
             )
             test_session.add(test_doc)
             test_process_state = ProcessState(

diff --git a/welearn_database/data/details_dict.py b/welearn_database/data/details_dict.py
@@ -0,0 +1,51 @@
+from dataclasses import asdict, is_dataclass
+
+from sqlalchemy import types
+
+
+class DetailsDict(types.TypeDecorator):
+    """
+    JSON column type that accepts dataclass instances inside the stored value.
+
+    On bind, dataclass instances found at the top level or nested in lists,
+    tuples and dicts are converted to plain dictionaries with
+    ``dataclasses.asdict`` before the value is handed to the JSON type.
+    Nothing is converted back on load.
+    """
+
+    impl = types.JSON
+    cache_ok = True
+
+    @staticmethod
+    def _is_dataclass_instance(obj):
+        """Return True for dataclass instances but not for dataclass types."""
+        # is_dataclass() is also true for the class object, which asdict() rejects.
+        return is_dataclass(obj) and not isinstance(obj, type)
+
+    def _inner_serialize_dataclass(self, value):
+        """
+        Return ``value`` with every dataclass instance replaced by a dict.
+
+        Lists, tuples and dicts are rebuilt recursively (tuples come back as
+        lists, their JSON form); any other value is returned unchanged.
+        """
+        match value:
+            case list() | tuple():
+                return [self._inner_serialize_dataclass(item) for item in value]
+            case dict():
+                return {k: self._inner_serialize_dataclass(v) for k, v in value.items()}
+        if self._is_dataclass_instance(value):
+            # asdict() recurses into the dataclass fields on its own.
+            return asdict(value)
+        return value
+
+    def process_bind_param(self, value, dialect):
+        """Return the JSON-ready form of ``value`` for the bound parameter."""
+        # Serialize recursively without mutating the original object stored on the ORM instance.
+        return self._inner_serialize_dataclass(value)
+
+    def coerce_compared_value(self, op, value):
+        """Let the JSON impl type the right-hand operand of expressions."""
+        # Without this, the key in ``column["key"]`` would be typed as
+        # DetailsDict and JSON-encoded instead of used as an index.
+        return self.impl.coerce_compared_value(op, value)
diff --git a/welearn_database/data/models/document_related.py b/welearn_database/data/models/document_related.py
@@ -16,6 +16,7 @@
 from sqlalchemy.dialects.postgresql import ARRAY, ENUM, TIMESTAMP
 from sqlalchemy.orm import Mapped, mapped_column, relationship, validates
 
+from welearn_database.data.details_dict import DetailsDict
 from welearn_database.data.enumeration import (
     ContextType,
     Counter,
@@ -83,7 +84,7 @@ class WeLearnDocument(Base):
     lang: Mapped[str | None]
     description: Mapped[str | None]
     full_content: Mapped[str | None]
-    details: Mapped[dict[str, Any] | None]
+    details: Mapped[dict[str, Any] | None] = mapped_column(DetailsDict)
     trace: Mapped[int | None] = mapped_column(types.BIGINT)
     corpus_id: Mapped[UUID] = mapped_column(
         types.Uuid,