From 12db02bc0f5ac554360563223e4c0af3aa496183 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Mon, 15 Jun 2026 17:25:54 +0200 Subject: [PATCH 01/30] create me a Compatibility class --- backend/lib/processor.py | 45 +++++++- common/lib/compatibility.py | 212 ++++++++++++++++++++++++++++++++++++ 2 files changed, 253 insertions(+), 4 deletions(-) create mode 100644 common/lib/compatibility.py diff --git a/backend/lib/processor.py b/backend/lib/processor.py index b4f1c05aa..f74ef331a 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -17,6 +17,7 @@ from backend.lib.worker import BasicWorker from common.lib.dataset import DataSet, StatusType +from common.lib.compatibility import Compatibility from common.lib.fourcat_module import FourcatModule from common.lib.helpers import get_software_commit, remove_nuls, send_email, hash_to_md5 from common.lib.exceptions import (WorkerInterruptedException, ProcessorInterruptedException, ProcessorException, @@ -37,10 +38,18 @@ class BasicProcessor(FourcatModule, BasicWorker, metaclass=abc.ABCMeta): be used as input for another processor (though whether and when this is useful is another question). - To determine whether a processor can process a given dataset, you can - define a `is_compatible_with(FourcatModule module=None, config=None):) -> bool` class - method which takes a dataset as argument and returns a bool that determines - if this processor is considered compatible with that dataset. For example: + To determine whether a processor can process a given dataset, declare a + Compatibility specification as the `compatibility` class attribute. The + default `is_compatible_with` is evaluated from it. For example: + + .. code-block:: python + + compatibility = Compatibility(types={"linguistic-features"}) + + Processors with genuinely dynamic requirements (e.g. 
ones that must inspect + a dataset's genealogy) may instead override `is_compatible_with(cls, + module=None, config=None) -> bool` directly; an override takes precedence + over the `compatibility` attribute. For example: .. code-block:: python @@ -97,6 +106,11 @@ def is_compatible_with(cls, module=None, config=None): #: `remove_disposable_files()` method will be called. for_cleanup = None + #: A common.lib.compatibility.Compatibility object describing which datasets + #: this processor accepts. When set, the default is_compatible_with() is + #: evaluated from it. + compatibility = None + def work(self): """ Process a dataset @@ -973,6 +987,29 @@ def _validate_map_item_post_run(self): except Exception: pass + @classmethod + def is_compatible_with(cls, module=None, config=None): + """ + Determine whether this processor can run on a given module. + + When the processor defines a `compatibility` attribute, this is + evaluated from it. Processors whose requirements cannot be expressed + that way (for example, ones that must inspect a dataset's ancestry) may + override this method instead; the override is used in preference to the + attribute. + + When neither is provided, the processor accepts only top-level datasets + (those without a parent), which preserves the historical default. + + :param module: Dataset (normally) or processor to check against + :param ConfigManager|None config: Context-aware configuration reader + :return bool: + """ + if cls.compatibility is not None: + return cls.compatibility.is_compatible_with(module, config=config) + + return bool(module is not None and module.is_top_dataset()) + @classmethod def is_filter(cls): """ diff --git a/common/lib/compatibility.py b/common/lib/compatibility.py new file mode 100644 index 000000000..c43660af8 --- /dev/null +++ b/common/lib/compatibility.py @@ -0,0 +1,212 @@ +""" +Declarative processor compatibility. 
+ +A Compatibility object describes the conditions under which a processor can run +on a dataset: + +* the data shape it consumes -- the dataset's type, file extension, media type, + datasource, and any columns it needs; +* the environment it needs -- external executables and 4CAT configuration + settings; +* the follow-up processors that are most relevant for its output, and any that + should never be offered. + +A processor declares one as its `compatibility` class attribute, for example:: + + compatibility = Compatibility( + media_types={"video"}, + type_prefixes={"video-downloader"}, + required_settings={("video-downloader.ffmpeg_path", shutil.which)}, + ) + +BasicProcessor.is_compatible_with() evaluates it. A processor whose +requirements cannot be expressed this way -- for example one that must inspect +a dataset's ancestry -- may override is_compatible_with() instead; the override +is used in preference to the attribute. + +`_maybe_call`: a utility function to safely read attributes or call methods on a + module, handling cases where the attribute or method might not exist or raise an exception. +Normally a `module` is a DataSet, but the values read here (its type, extension, media type +and so on) are also available on a processor class, so a processor can be checked even when +no dataset exists yet. +""" +from __future__ import annotations + +import shutil +from dataclasses import dataclass +from typing import Iterable, List, Optional + + +def _maybe_call(module, method): + """ + Read `module.method` without assuming it exists. + + Calls it and returns the result when it is a method, returns the value when + it is a plain attribute, and returns None when it is missing or raises. A + DataSet exposes these as methods; a processor class exposes some of them as + well, and this keeps the same check working for both. 
+ """ + attr = getattr(module, method, None) + if attr is None: + return None + if callable(attr): + try: + return attr() + except Exception: + return None + return attr + + +@dataclass +class Compatibility: + """ + Declarative compatibility specification for a processor. + + Any axis left unset (None, or empty) is not checked. + + The four identity axes -- types, type_prefixes, media_types and + datasources -- describe what kind of dataset the processor accepts. If any + of them are set, the module must match at least one (they are OR-ed). + Every other axis is an additional requirement that must also hold (they are + AND-ed). + """ + + # --- consumed data shape: identity (the module must match one of these) --- + # Dataset types the processor accepts, matched exactly. + types: Optional[Iterable[str]] = None + # Dataset type prefixes the processor accepts, matched with str.startswith. + type_prefixes: Optional[Iterable[str]] = None + # Media types the processor accepts, e.g. {"video", "image", "audio", "text"}. + media_types: Optional[Iterable[str]] = None + # Datasources the processor accepts, e.g. {"4chan", "reddit"}. + datasources: Optional[Iterable[str]] = None + + # --- structural gates (each must hold when set) --- + # Result-file extensions the processor accepts, e.g. {"csv", "ndjson"}. + extensions: Optional[Iterable[str]] = None + # When True, the processor only accepts a top-level dataset (one with no parent). + top_dataset_only: bool = False + # When set, the dataset's is_rankable() must equal this. None means it does not matter. + rankable: Optional[bool] = None + # Columns that must all be present in the dataset. This can only be checked + # against a real dataset, as it reads the dataset's columns. + requires_columns: Iterable[str] = () + + # --- environment requirements --- + # Executables that must be found on the system path (checked with shutil.which). + required_packages: Iterable[str] = () + # Configuration the processor needs. 
Each entry is either a setting key, + # which must resolve to a truthy value, or a (key, expected) pair. The + # expected part may be a single value the setting must equal, a collection + # the setting's value must be in, or a function that receives the value and + # returns whether it is acceptable. + required_settings: Iterable = () + + # --- follow-up processors --- + # Processor types to recommend first as next steps for this processor's output. + preferred_followups: Iterable[str] = () + # Processor types that should never be offered as follow-ups here, even + # when they are otherwise compatible. + excluded_followups: Iterable[str] = () + + def is_compatible_with(self, module, config=None) -> bool: + """ + Return whether `module` meets every requirement in this specification. + + `module` is normally a DataSet but may be a processor class. `config` + is the configuration reader, or None when none is available. + """ + return not self.unmet_requirements(module, config=config) + + def unmet_requirements(self, module, config=None) -> List[str]: + """ + Return the requirements `module` does not meet, as readable strings. + + An empty list means `module` is compatible. Each string names one thing + that is missing -- a wrong dataset type, an absent column, a setting + that is not configured, and so on. 
+ """ + reasons: List[str] = [] + if module is None: + return ["no dataset provided"] + + # if the processor names the kinds of dataset it accepts, the module + # must be one of them + if self._identity_declared() and not self._identity_matches(module): + reasons.append("dataset type/media is not accepted") + + if self.top_dataset_only and not _maybe_call(module, "is_top_dataset"): + reasons.append("requires a top-level dataset") + + if self.extensions is not None: + extension = _maybe_call(module, "get_extension") + if extension not in set(self.extensions): + reasons.append("requires extension: %s" % ", ".join(self.extensions)) + + if self.rankable is not None: + if bool(_maybe_call(module, "is_rankable")) != self.rankable: + reasons.append( + "requires a rankable dataset" if self.rankable + else "requires a non-rankable dataset" + ) + + # the only check that really needs a DataSet object + if self.requires_columns: + columns = _maybe_call(module, "get_columns") or [] + missing = [column for column in self.requires_columns if column not in columns] + if missing: + reasons.append("requires column(s): %s" % ", ".join(missing)) + + for requirement in self.required_settings: + key, expected = (requirement, None) if isinstance(requirement, str) else requirement + value = config.get(key) if config is not None else None + # no expected value; just check that the setting is truthy + if expected is None: + met = bool(value) + # some function to check (special check for things like {"video-downloader.ffmpeg_path": lambda p: shutil.which(p) is not None}) + elif callable(expected): + met = bool(expected(value)) + # a collection of acceptable values + elif isinstance(expected, (set, frozenset, list, tuple)): + met = value in expected + # a single expected value + else: + met = value == expected + if not met: + reasons.append("requires setting: %s" % key) + + for package in self.required_packages: + if not shutil.which(package): + reasons.append("requires package: %s" % 
package) + + return reasons + + def _identity_declared(self) -> bool: + """Whether the processor names any kind of dataset it accepts.""" + return any( + axis is not None + for axis in (self.types, self.type_prefixes, self.media_types, self.datasources) + ) + + def _identity_matches(self, module) -> bool: + """Whether the module is one of the kinds of dataset the processor accepts.""" + module_type = getattr(module, "type", None) + + if self.types is not None and module_type in set(self.types): + return True + + if self.type_prefixes is not None and module_type is not None \ + and any(module_type.startswith(prefix) for prefix in self.type_prefixes): + return True + + if self.media_types is not None: + media = _maybe_call(module, "get_media_type") or getattr(module, "media_type", None) + if media in set(self.media_types): + return True + + if self.datasources is not None: + parameters = getattr(module, "parameters", None) or {} + if isinstance(parameters, dict) and parameters.get("datasource") in set(self.datasources): + return True + + return False From bfaa455ec1fe07e6275cc0dd8b37b86ffb8ebdc1 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Mon, 15 Jun 2026 17:26:20 +0200 Subject: [PATCH 02/30] test it on some processors --- processors/text-analysis/tokenise.py | 12 ++---------- processors/visualisation/word-cloud.py | 15 ++++++--------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py index 58f45f6a2..c2d2c183c 100644 --- a/processors/text-analysis/tokenise.py +++ b/processors/text-analysis/tokenise.py @@ -17,6 +17,7 @@ from common.lib.helpers import UserInput, get_interval_descriptor from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = ["Stijn Peeters", "Sal Hagen"] __credits__ = ["Stijn Peeters", "Sal Hagen"] @@ -47,16 +48,7 @@ class Tokenise(BasicProcessor): "[Words in OpenTaal word 
list](https://github.com/OpenTaal/opentaal-wordlist)" ] - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on all csv and NDJSON datasets - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - - return module.get_extension() in ("csv", "ndjson") + compatibility = Compatibility(extensions={"csv", "ndjson"}) @classmethod def get_options(cls, parent_dataset=None, config=None): diff --git a/processors/visualisation/word-cloud.py b/processors/visualisation/word-cloud.py index eea5b5aaa..be922a0a5 100644 --- a/processors/visualisation/word-cloud.py +++ b/processors/visualisation/word-cloud.py @@ -6,6 +6,7 @@ from wordcloud import WordCloud from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Sal Hagen" @@ -24,15 +25,11 @@ class MakeWordCloud(BasicProcessor): description = "Generates a word cloud with words sized on occurrence." 
# description displayed in UI extension = "svg" - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on rankable items - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ("tfidf", "collocations", "vector-ranker", "vectorise-tokens-by-category", "similar-word2vec", "extract-nouns", "get-entities") + # Allow processor on rankable items + compatibility = Compatibility(types={ + "tfidf", "collocations", "vector-ranker", "vectorise-tokens-by-category", + "similar-word2vec", "extract-nouns", "get-entities" + }) @classmethod def get_options(cls, parent_dataset=None, config=None): From 5eb8addd17326180c45b729d0bcbdb78b1193aba Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Mon, 15 Jun 2026 19:06:04 +0200 Subject: [PATCH 03/30] processor: clear default behaviour --- backend/lib/processor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/backend/lib/processor.py b/backend/lib/processor.py index f74ef331a..369ff2019 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -1008,7 +1008,11 @@ def is_compatible_with(cls, module=None, config=None): if cls.compatibility is not None: return cls.compatibility.is_compatible_with(module, config=config) - return bool(module is not None and module.is_top_dataset()) + # Legacy default: a processor that declares no `compatibility` and does + # not override this method is compatible only with top-level datasets + # (those with no parent), i.e. it runs on collected data and not on the + # output of other processors. 
+ return Compatibility(top_dataset_only=True).is_compatible_with(module, config=config) @classmethod def is_filter(cls): From d7cd873516806548b74c511d1452eb38702f5b4f Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Mon, 15 Jun 2026 19:06:26 +0200 Subject: [PATCH 04/30] convert more type processors to use Compatibility --- processors/conversion/clarifai_to_csv.py | 12 +++--------- processors/networks/coword_network.py | 12 +++--------- processors/text-analysis/collocations.py | 12 +++--------- processors/text-analysis/documents_per_topic.py | 12 +++--------- processors/visualisation/youtube_thumbnails.py | 12 +++--------- 5 files changed, 15 insertions(+), 45 deletions(-) diff --git a/processors/conversion/clarifai_to_csv.py b/processors/conversion/clarifai_to_csv.py index 760b5c9c8..6aeb08168 100644 --- a/processors/conversion/clarifai_to_csv.py +++ b/processors/conversion/clarifai_to_csv.py @@ -4,6 +4,7 @@ import csv from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -27,15 +28,8 @@ class ConvertClarifaiOutputToCSV(BasicProcessor): description = "Convert the Clarifai API output to a simplified CSV file." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible - - :param module: Module determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "clarifai-api" + # Allow processor on Clarifai API output + compatibility = Compatibility(types={"clarifai-api"}) def process(self): """ diff --git a/processors/networks/coword_network.py b/processors/networks/coword_network.py index ee5c8dbc8..14be06c44 100644 --- a/processors/networks/coword_network.py +++ b/processors/networks/coword_network.py @@ -3,6 +3,7 @@ """ from backend.lib.preset import ProcessorPreset +from common.lib.compatibility import Compatibility __author__ = "Sal Hagen" __credits__ = ["Sal Hagen"] @@ -22,15 +23,8 @@ class CowordNetworker(ProcessorPreset): "amount of co-word occurrences." # description displayed in UI extension = "gexf" # extension of result file, used internally and in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on collocations - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "collocations" + # Allow processor to run on collocations + compatibility = Compatibility(types={"collocations"}) def get_processor_pipeline(self): """ diff --git a/processors/text-analysis/collocations.py b/processors/text-analysis/collocations.py index 2b9ffe71c..179d21731 100644 --- a/processors/text-analysis/collocations.py +++ b/processors/text-analysis/collocations.py @@ -10,6 +10,7 @@ from common.lib.helpers import UserInput from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility class GetCollocations(BasicProcessor): @@ -24,15 +25,8 @@ class 
GetCollocations(BasicProcessor): followups = ["preset-coword-network", "wordcloud"] - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token sets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "tokenise-posts" + # Allow processor on token sets + compatibility = Compatibility(types={"tokenise-posts"}) @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: diff --git a/processors/text-analysis/documents_per_topic.py b/processors/text-analysis/documents_per_topic.py index 370c8d778..75bf3bf90 100644 --- a/processors/text-analysis/documents_per_topic.py +++ b/processors/text-analysis/documents_per_topic.py @@ -3,6 +3,7 @@ """ from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException import json @@ -26,15 +27,8 @@ class TopicModelWordExtractor(BasicProcessor): followups = [] - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on topic models - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "topic-modeller" + # Allow processor on topic models + compatibility = Compatibility(types={"topic-modeller"}) def process(self): """ diff --git a/processors/visualisation/youtube_thumbnails.py b/processors/visualisation/youtube_thumbnails.py index e5feb4249..58b955154 100644 --- a/processors/visualisation/youtube_thumbnails.py +++ b/processors/visualisation/youtube_thumbnails.py @@ -7,6 +7,7 @@ from apiclient.discovery import build from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import 
get_yt_compatible_ids, UserInput @@ -35,15 +36,8 @@ class YouTubeThumbnails(BasicProcessor): max_retries = 3 sleep_time = 10 - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on YouTube metadata sets - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "youtube-metadata" + # Allow processor on YouTube metadata sets + compatibility = Compatibility(types={"youtube-metadata"}) @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: From 97ba63afda76ca1a96f0e9081a33aca6ce702708 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 16 Jun 2026 11:24:14 +0200 Subject: [PATCH 05/30] update type is_compatible_with checks --- .../conversion/twitter_ndjson_to_tcat_json.py | 11 +++------ processors/conversion/vision_api_to_csv.py | 12 +++------- processors/filtering/tiktok_refresh.py | 12 +++------- processors/metrics/group_hashes.py | 13 +++-------- .../networks/clarifai_bipartite_network.py | 12 +++------- .../google_vision_bipartite_network.py | 12 +++------- processors/networks/google_vision_network.py | 12 +++------- .../networks/hash_similarity_network.py | 10 +++----- .../text-analysis/generate_embeddings.py | 12 +++------- processors/text-analysis/post_topic_matrix.py | 12 +++------- processors/text-analysis/similar_words.py | 12 +++------- processors/text-analysis/tf_idf.py | 12 +++------- processors/text-analysis/top_vectors.py | 12 +++------- processors/text-analysis/topic_modeling.py | 12 +++------- processors/text-analysis/topic_words.py | 12 +++------- processors/text-analysis/vectorise.py | 12 +++------- processors/text-analysis/vectorise_by_cat.py | 11 +++------ processors/twitter/mention_export.py | 23 ++++--------------- processors/twitter/user_visibility.py | 12 +++------- processors/visualisation/download_tiktok.py | 12 +++------- processors/visualisation/histwords.py | 12 
+++------- processors/visualisation/video_hasher.py | 17 ++++---------- processors/visualisation/video_timelines.py | 18 ++++----------- processors/visualisation/youtube_imagewall.py | 12 +++------- 24 files changed, 77 insertions(+), 230 deletions(-) diff --git a/processors/conversion/twitter_ndjson_to_tcat_json.py b/processors/conversion/twitter_ndjson_to_tcat_json.py index aecf9e2a3..0df026298 100644 --- a/processors/conversion/twitter_ndjson_to_tcat_json.py +++ b/processors/conversion/twitter_ndjson_to_tcat_json.py @@ -4,6 +4,7 @@ import json from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -22,14 +23,8 @@ class ConvertNDJSONToJSON(BasicProcessor): followups = ["tcat-auto-upload"] - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Module to determine compatibility with - """ - return module.type == "twitterv2-search" + # Allow processor on Twitter/X (API v2) datasets + compatibility = Compatibility(types={"twitterv2-search"}) def process(self): """ diff --git a/processors/conversion/vision_api_to_csv.py b/processors/conversion/vision_api_to_csv.py index d2d733e04..8bd096698 100644 --- a/processors/conversion/vision_api_to_csv.py +++ b/processors/conversion/vision_api_to_csv.py @@ -4,6 +4,7 @@ import csv from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Stijn Peeters" @@ -49,15 +50,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == 
"google-vision-api" + # Allow processor on Google Vision API output + compatibility = Compatibility(types={"google-vision-api"}) def process(self): """ diff --git a/processors/filtering/tiktok_refresh.py b/processors/filtering/tiktok_refresh.py index 5de1fe473..a4cb52458 100644 --- a/processors/filtering/tiktok_refresh.py +++ b/processors/filtering/tiktok_refresh.py @@ -6,6 +6,7 @@ from datasources.tiktok_urls.search_tiktok_urls import TikTokScraper from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" @@ -21,15 +22,8 @@ class UpdateTikTok(BasicProcessor): description = "Re-query TikTok URLs to update the dataset, e.g. to refresh video URLs or like counts." extension = "ndjson" - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on NDJSON and CSV files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["tiktok-search", "tiktok-urls-search"] + # Allow processor on TikTok datasets + compatibility = Compatibility(types={"tiktok-search", "tiktok-urls-search"}) def process(self): """ diff --git a/processors/metrics/group_hashes.py b/processors/metrics/group_hashes.py index 6545a8528..cb79211fd 100644 --- a/processors/metrics/group_hashes.py +++ b/processors/metrics/group_hashes.py @@ -3,6 +3,7 @@ import imagehash from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import UserInput, normalize_crhash_components @@ -44,16 +45,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on above hasher output - - Could also allow on any CSV with appropriate fields - - :param module: Module to determine 
compatibility with - """ - return module.type == "image-hasher" + # Allow processor on image-hasher output (could also work on any CSV with the right fields) + compatibility = Compatibility(types={"image-hasher"}) @staticmethod def compute_groups(hashes, hash_type: str, hash_size: int | None, similarity_pct: float) -> list[int]: diff --git a/processors/networks/clarifai_bipartite_network.py b/processors/networks/clarifai_bipartite_network.py index d3d344b38..8587275ae 100644 --- a/processors/networks/clarifai_bipartite_network.py +++ b/processors/networks/clarifai_bipartite_network.py @@ -2,6 +2,7 @@ Google Vision API co-label network """ from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Stijn Peeters" @@ -46,15 +47,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on Google Vision API data - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "clarifai-api" + # Allow processor to run on Clarifai API data + compatibility = Compatibility(types={"clarifai-api"}) def process(self): """ diff --git a/processors/networks/google_vision_bipartite_network.py b/processors/networks/google_vision_bipartite_network.py index df48d5321..e281ee0bb 100644 --- a/processors/networks/google_vision_bipartite_network.py +++ b/processors/networks/google_vision_bipartite_network.py @@ -2,6 +2,7 @@ Google Vision API co-label network """ from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput from common.lib.exceptions import ProcessorInterruptedException @@ -61,15 +62,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def 
is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on Google Vision API data - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "google-vision-api" + # Allow processor to run on Google Vision API data + compatibility = Compatibility(types={"google-vision-api"}) def process(self): """ diff --git a/processors/networks/google_vision_network.py b/processors/networks/google_vision_network.py index 258a3e975..dbc284df4 100644 --- a/processors/networks/google_vision_network.py +++ b/processors/networks/google_vision_network.py @@ -2,6 +2,7 @@ Google Vision API co-label network """ from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput from common.lib.exceptions import ProcessorInterruptedException @@ -60,15 +61,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on Google Vision API data - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "google-vision-api" + # Allow processor to run on Google Vision API data + compatibility = Compatibility(types={"google-vision-api"}) def process(self): """ diff --git a/processors/networks/hash_similarity_network.py b/processors/networks/hash_similarity_network.py index 997c2a8a4..df63bcd54 100644 --- a/processors/networks/hash_similarity_network.py +++ b/processors/networks/hash_similarity_network.py @@ -7,6 +7,7 @@ import numpy as np from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorException from common.lib.helpers import UserInput @@ -67,13 +68,8 @@ def get_options(cls, 
parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Currently only allowed on video-hashes, but technically any row of bit hashes will work. Could check for "hash" - in columns, but... how to make that a check as a classmethod? - """ - return module.type == "video-hashes" + # Currently only allowed on video-hashes, though any row of bit hashes would work. + compatibility = Compatibility(types={"video-hashes"}) def process(self): """ diff --git a/processors/text-analysis/generate_embeddings.py b/processors/text-analysis/generate_embeddings.py index 4dd6cdc7b..d0159aa1f 100644 --- a/processors/text-analysis/generate_embeddings.py +++ b/processors/text-analysis/generate_embeddings.py @@ -10,6 +10,7 @@ from common.lib.helpers import UserInput, convert_to_int from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException __author__ = "Sal Hagen" @@ -114,15 +115,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token sets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "tokenise-posts" + # Allow processor on token sets + compatibility = Compatibility(types={"tokenise-posts"}) def process(self): """ diff --git a/processors/text-analysis/post_topic_matrix.py b/processors/text-analysis/post_topic_matrix.py index 4351eac3a..78d452b67 100644 --- a/processors/text-analysis/post_topic_matrix.py +++ b/processors/text-analysis/post_topic_matrix.py @@ -4,6 +4,7 @@ from common.lib.helpers import UserInput from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException import 
csv @@ -75,15 +76,8 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on topic models - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "topic-modeller" + # Allow processor on topic models + compatibility = Compatibility(types={"topic-modeller"}) def process(self): """ diff --git a/processors/text-analysis/similar_words.py b/processors/text-analysis/similar_words.py index 9aa0996f5..5dbdfb2f6 100644 --- a/processors/text-analysis/similar_words.py +++ b/processors/text-analysis/similar_words.py @@ -7,6 +7,7 @@ from common.lib.helpers import UserInput, convert_to_int, convert_to_float from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException __author__ = "Sal Hagen" @@ -67,15 +68,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on word embedding models - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "generate-embeddings" + # Allow processor on word embedding models + compatibility = Compatibility(types={"generate-embeddings"}) def process(self): """ diff --git a/processors/text-analysis/tf_idf.py b/processors/text-analysis/tf_idf.py index 7fc3aba80..90e165085 100644 --- a/processors/text-analysis/tf_idf.py +++ b/processors/text-analysis/tf_idf.py @@ -9,6 +9,7 @@ from common.lib.helpers import UserInput, convert_to_int from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from sklearn.feature_extraction.text import TfidfVectorizer from gensim.models import 
TfidfModel @@ -105,15 +106,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token sets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "tokenise-posts" + # Allow processor on token sets + compatibility = Compatibility(types={"tokenise-posts"}) def process(self): """ diff --git a/processors/text-analysis/top_vectors.py b/processors/text-analysis/top_vectors.py index 0a971c7a3..2eac313cd 100644 --- a/processors/text-analysis/top_vectors.py +++ b/processors/text-analysis/top_vectors.py @@ -7,6 +7,7 @@ from common.lib.helpers import UserInput, convert_to_int from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -52,15 +53,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token vectors - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "vectorise-tokens" + # Allow processor on token vectors + compatibility = Compatibility(types={"vectorise-tokens"}) def process(self): """ diff --git a/processors/text-analysis/topic_modeling.py b/processors/text-analysis/topic_modeling.py index b41e55422..73348c618 100644 --- a/processors/text-analysis/topic_modeling.py +++ b/processors/text-analysis/topic_modeling.py @@ -4,6 +4,7 @@ from common.lib.helpers import UserInput from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException import json @@ -85,15 +86,8 @@ def get_options(cls, parent_dataset=None, 
config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token sets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "tokenise-posts" + # Allow processor on token sets + compatibility = Compatibility(types={"tokenise-posts"}) def process(self): """ diff --git a/processors/text-analysis/topic_words.py b/processors/text-analysis/topic_words.py index 40564a3fc..8b2c731ab 100644 --- a/processors/text-analysis/topic_words.py +++ b/processors/text-analysis/topic_words.py @@ -4,6 +4,7 @@ from common.lib.helpers import UserInput from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException import pickle @@ -48,15 +49,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on topic models - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "topic-modeller" + # Allow processor on topic models + compatibility = Compatibility(types={"topic-modeller"}) def process(self): """ diff --git a/processors/text-analysis/vectorise.py b/processors/text-analysis/vectorise.py index b9f854f59..c32b815b5 100644 --- a/processors/text-analysis/vectorise.py +++ b/processors/text-analysis/vectorise.py @@ -6,6 +6,7 @@ import itertools from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -24,15 +25,8 @@ class Vectorise(BasicProcessor): followups = ["vector-ranker"] - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token sets - - :param module: 
Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "tokenise-posts" + # Allow processor on token sets + compatibility = Compatibility(types={"tokenise-posts"}) def process(self): """ diff --git a/processors/text-analysis/vectorise_by_cat.py b/processors/text-analysis/vectorise_by_cat.py index e7380ce16..47d098869 100644 --- a/processors/text-analysis/vectorise_by_cat.py +++ b/processors/text-analysis/vectorise_by_cat.py @@ -6,6 +6,7 @@ import pickle from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Dale Wahl" @@ -25,14 +26,8 @@ class VectoriseByCategory(BasicProcessor): followups = ["wordcloud", "render-graphs-isometric", "render-rankflow"] - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token sets - - :param module: Module to determine compatibility with - """ - return module.type == "tokenise-posts" + # Allow processor on token sets + compatibility = Compatibility(types={"tokenise-posts"}) @classmethod def get_options(cls, parent_dataset=None, config=None): diff --git a/processors/twitter/mention_export.py b/processors/twitter/mention_export.py index d8f6723ef..834ee9287 100644 --- a/processors/twitter/mention_export.py +++ b/processors/twitter/mention_export.py @@ -4,6 +4,7 @@ import csv from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorException, ProcessorInterruptedException __author__ = "Dale Wahl" @@ -22,15 +23,8 @@ class TwitterMentionsExport(BasicProcessor): description = "Identifies mentions types and creates mentions table (tweet id, from author id, from username, to user id, to username, mention type)" # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - @classmethod 
- def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["twitterv2-search"] + # Allow processor on Twitter/X (API v2) datasets + compatibility = Compatibility(types={"twitterv2-search"}) def process(self): """ @@ -150,15 +144,8 @@ class TCATMentionsExport(BasicProcessor): description = "Identifies mentions types and creates mentions table (tweet id, from author id, from username, to username)" # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager config: Configuration reader (context-aware) - """ - return module.type in ["dmi-tcat-search"] + # Allow processor on imported TCAT datasets + compatibility = Compatibility(types={"dmi-tcat-search"}) def process(self): """ diff --git a/processors/twitter/user_visibility.py b/processors/twitter/user_visibility.py index 045b8e446..2aade0846 100644 --- a/processors/twitter/user_visibility.py +++ b/processors/twitter/user_visibility.py @@ -5,6 +5,7 @@ from common.lib.helpers import get_interval_descriptor from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException from common.lib.user_input import UserInput @@ -45,15 +46,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: 
Configuration reader (context-aware) - """ - return module.type in ["twitterv2-search", "dmi-tcat-search"] + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) def process(self): """ diff --git a/processors/visualisation/download_tiktok.py b/processors/visualisation/download_tiktok.py index aed561930..f43eab457 100644 --- a/processors/visualisation/download_tiktok.py +++ b/processors/visualisation/download_tiktok.py @@ -16,6 +16,7 @@ from datasources.tiktok.search_tiktok import SearchTikTok as SearchTikTokByImport from processors.visualisation.download_images import ImageDownloader from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" @@ -82,15 +83,8 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor TikTok datasets - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager config: Configuration reader (context-aware) - """ - return module.type in ["tiktok-search", "tiktok-urls-search"] + # Allow processor on TikTok datasets + compatibility = Compatibility(types={"tiktok-search", "tiktok-urls-search"}) def process(self): """ diff --git a/processors/visualisation/histwords.py b/processors/visualisation/histwords.py index 11baa85e5..3cbe90731 100644 --- a/processors/visualisation/histwords.py +++ b/processors/visualisation/histwords.py @@ -12,6 +12,7 @@ from gensim.models import KeyedVectors from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput, convert_to_int, get_4cat_canvas, convert_to_float from common.lib.exceptions import ProcessorInterruptedException @@ -104,15 +105,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def 
is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token sets - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "generate-embeddings" + # Allow processor on word embedding models + compatibility = Compatibility(types={"generate-embeddings"}) def process(self): # parse parameters diff --git a/processors/visualisation/video_hasher.py b/processors/visualisation/video_hasher.py index ae4a84aa6..d7e8ceaf8 100644 --- a/processors/visualisation/video_hasher.py +++ b/processors/visualisation/video_hasher.py @@ -16,6 +16,7 @@ from backend.lib.processor import BasicProcessor from backend.lib.preset import ProcessorAdvancedPreset +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException, ProcessorException from common.lib.user_input import UserInput @@ -370,12 +371,8 @@ def get_options(cls, parent_dataset=None, config=None): "max": 100 }} - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on video hasher - """ - return module.type in ["video-hasher-1"] + # Allow on video hasher + compatibility = Compatibility(types={"video-hasher-1"}) def process(self): """ @@ -488,12 +485,8 @@ def get_options(cls, parent_dataset=None, config=None): "max": 100 }} - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on video hasher - """ - return module.type in ["video-hasher-1"] + # Allow on video hasher + compatibility = Compatibility(types={"video-hasher-1"}) def process(self): """ diff --git a/processors/visualisation/video_timelines.py b/processors/visualisation/video_timelines.py index a57ff2ea7..7c6a3fcc2 100644 --- a/processors/visualisation/video_timelines.py +++ b/processors/visualisation/video_timelines.py @@ -14,6 +14,7 @@ from ural import is_url from backend.lib.processor import BasicProcessor +from 
common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException from common.lib.user_input import UserInput from common.lib.helpers import get_4cat_canvas @@ -62,20 +63,9 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine compatibility - - Compatible with 'Extract video frames'. Can in principle run on - anything that stores related images in separate folders in a zip - archive. Each folder will be rendered as a separate timeline. - - :param str module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.type in ["video-frames", "video-scene-frames"] + # Compatible with extracted video frames (or anything that stores related + # images in separate folders within a zip archive). + compatibility = Compatibility(types={"video-frames", "video-scene-frames"}) def process(self): metadata = {} diff --git a/processors/visualisation/youtube_imagewall.py b/processors/visualisation/youtube_imagewall.py index 96285aa08..f205c4478 100644 --- a/processors/visualisation/youtube_imagewall.py +++ b/processors/visualisation/youtube_imagewall.py @@ -10,6 +10,7 @@ from PIL import Image, ImageOps, ImageDraw, ImageFont from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput, convert_to_int __author__ = "Sal Hagen" @@ -56,15 +57,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on YouTube thumbnail sets - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "youtube-thumbnails" + # Allow processor on YouTube 
thumbnail sets + compatibility = Compatibility(types={"youtube-thumbnails"}) def process(self): """ From 76533fbc679c2178e2da24ffc72c39ab23a77fde Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 16 Jun 2026 11:27:08 +0200 Subject: [PATCH 06/30] base_twitter_stats: compatibility with abstract class, cannot overwrite is_compatible_with as it would be inherited, instead compatible w/ empty set() i.e. nothing --- processors/twitter/base_twitter_stats.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/processors/twitter/base_twitter_stats.py b/processors/twitter/base_twitter_stats.py index e9d58930f..89cd7c471 100644 --- a/processors/twitter/base_twitter_stats.py +++ b/processors/twitter/base_twitter_stats.py @@ -6,6 +6,7 @@ from common.lib.helpers import pad_interval, get_interval_descriptor from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorException, ProcessorInterruptedException __author__ = "Dale Wahl" @@ -27,15 +28,9 @@ class TwitterStatsBase(BasicProcessor): sorted = False - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return False + # Abstract base for the Twitter statistics processors; not runnable on its + # own (an empty type set never matches a dataset). 
+ compatibility = Compatibility(types=set()) def process(self): """ From f19adf966d7ed291765a960cb525792899ffb9f7 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 16 Jun 2026 11:27:40 +0200 Subject: [PATCH 07/30] is compatible w twitter stats subclasses --- processors/twitter/aggregate_stats.py | 12 +++--------- processors/twitter/custom_stats.py | 12 +++--------- processors/twitter/hashtag_stats.py | 12 +++--------- processors/twitter/identical_tweets.py | 12 +++--------- processors/twitter/source_stats.py | 12 +++--------- processors/twitter/twitter_stats.py | 12 +++--------- processors/twitter/user_stats_individual.py | 12 +++--------- 7 files changed, 21 insertions(+), 63 deletions(-) diff --git a/processors/twitter/aggregate_stats.py b/processors/twitter/aggregate_stats.py index f1b0c667b..7c84555ee 100644 --- a/processors/twitter/aggregate_stats.py +++ b/processors/twitter/aggregate_stats.py @@ -7,6 +7,7 @@ from common.lib.helpers import UserInput, pad_interval, get_interval_descriptor from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorException __author__ = "Dale Wahl" @@ -66,15 +67,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["twitterv2-search", "dmi-tcat-search"] + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) def trim_mean(self, values, cut_pct): """ diff --git a/processors/twitter/custom_stats.py b/processors/twitter/custom_stats.py index 0e4d24836..f6425e12d 100644 --- a/processors/twitter/custom_stats.py +++ 
b/processors/twitter/custom_stats.py @@ -4,6 +4,7 @@ from common.lib.exceptions import ProcessorException from common.lib.helpers import UserInput from processors.twitter.base_twitter_stats import TwitterStatsBase +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -56,15 +57,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["twitterv2-search", "dmi-tcat-search"] + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) def map_data(self, post): """ diff --git a/processors/twitter/hashtag_stats.py b/processors/twitter/hashtag_stats.py index df41a3ee5..9eb6bf214 100644 --- a/processors/twitter/hashtag_stats.py +++ b/processors/twitter/hashtag_stats.py @@ -3,6 +3,7 @@ """ from common.lib.helpers import UserInput from processors.twitter.base_twitter_stats import TwitterStatsBase +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -52,15 +53,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: # } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["twitterv2-search", "dmi-tcat-search"] + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) def map_data(self, post): """ diff --git 
a/processors/twitter/identical_tweets.py b/processors/twitter/identical_tweets.py index 5e83c734a..2783b333c 100644 --- a/processors/twitter/identical_tweets.py +++ b/processors/twitter/identical_tweets.py @@ -3,6 +3,7 @@ """ from common.lib.helpers import UserInput from processors.twitter.base_twitter_stats import TwitterStatsBase +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -43,15 +44,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["twitterv2-search", "dmi-tcat-search"] + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) def map_data(self, post): """ diff --git a/processors/twitter/source_stats.py b/processors/twitter/source_stats.py index e74071100..6e1a8b8af 100644 --- a/processors/twitter/source_stats.py +++ b/processors/twitter/source_stats.py @@ -3,6 +3,7 @@ """ from common.lib.helpers import UserInput from processors.twitter.base_twitter_stats import TwitterStatsBase +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -52,15 +53,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: # } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["twitterv2-search", "dmi-tcat-search"] + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + 
compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) def map_data(self, post): """ diff --git a/processors/twitter/twitter_stats.py b/processors/twitter/twitter_stats.py index 4b383023e..90a9e3dbe 100644 --- a/processors/twitter/twitter_stats.py +++ b/processors/twitter/twitter_stats.py @@ -3,6 +3,7 @@ """ from common.lib.helpers import UserInput from processors.twitter.base_twitter_stats import TwitterStatsBase +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -47,15 +48,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["twitterv2-search", "dmi-tcat-search"] + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) def map_data(self, post): """ diff --git a/processors/twitter/user_stats_individual.py b/processors/twitter/user_stats_individual.py index f79750f5a..cc25f6561 100644 --- a/processors/twitter/user_stats_individual.py +++ b/processors/twitter/user_stats_individual.py @@ -3,6 +3,7 @@ """ from common.lib.helpers import UserInput from processors.twitter.base_twitter_stats import TwitterStatsBase +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorException __author__ = "Dale Wahl" @@ -52,15 +53,8 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: # } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader 
(context-aware) - """ - return module.type in ["twitterv2-search", "dmi-tcat-search"] + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) def map_data(self, post): """ From 0ddfdde18ce77407437ce6a4ef0f2553fcb0903d Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 16 Jun 2026 12:35:05 +0200 Subject: [PATCH 08/30] move compatibility, fold in followups --- processors/conversion/twitter_ndjson_to_tcat_json.py | 5 +---- processors/conversion/vision_api_to_csv.py | 6 +++--- processors/metrics/group_hashes.py | 8 ++++---- processors/networks/clarifai_bipartite_network.py | 6 +++--- .../networks/google_vision_bipartite_network.py | 6 +++--- processors/networks/google_vision_network.py | 6 +++--- processors/networks/hash_similarity_network.py | 6 +++--- processors/text-analysis/collocations.py | 4 +--- processors/text-analysis/documents_per_topic.py | 2 -- processors/text-analysis/generate_embeddings.py | 6 ++---- processors/text-analysis/post_topic_matrix.py | 6 ++---- processors/text-analysis/similar_words.py | 6 ++---- processors/text-analysis/tokenise.py | 4 +--- processors/text-analysis/top_vectors.py | 8 +++----- processors/text-analysis/topic_modeling.py | 9 ++++----- processors/text-analysis/topic_words.py | 6 ++---- processors/text-analysis/vectorise.py | 4 +--- processors/twitter/aggregate_stats.py | 6 +++--- processors/twitter/base_twitter_stats.py | 5 ++--- processors/twitter/custom_stats.py | 6 +++--- processors/twitter/hashtag_stats.py | 6 +++--- processors/twitter/identical_tweets.py | 6 +++--- processors/twitter/source_stats.py | 6 +++--- processors/twitter/twitter_stats.py | 6 +++--- processors/twitter/user_stats_individual.py | 6 +++--- processors/twitter/user_visibility.py | 6 +++--- processors/visualisation/download_tiktok.py | 6 ++---- processors/visualisation/histwords.py | 6 +++--- processors/visualisation/video_hasher.py | 12 ++++++------ 
processors/visualisation/video_timelines.py | 8 ++++---- processors/visualisation/youtube_imagewall.py | 6 +++--- processors/visualisation/youtube_thumbnails.py | 12 +++++------- 32 files changed, 87 insertions(+), 114 deletions(-) diff --git a/processors/conversion/twitter_ndjson_to_tcat_json.py b/processors/conversion/twitter_ndjson_to_tcat_json.py index 0df026298..44f4f0b04 100644 --- a/processors/conversion/twitter_ndjson_to_tcat_json.py +++ b/processors/conversion/twitter_ndjson_to_tcat_json.py @@ -21,10 +21,8 @@ class ConvertNDJSONToJSON(BasicProcessor): description = "Convert a Twitter dataset to a TCAT-compatible format. This file can then be uploaded to TCAT." # description displayed in UI extension = "json" # extension of result file, used internally and in UI - followups = ["tcat-auto-upload"] - # Allow processor on Twitter/X (API v2) datasets - compatibility = Compatibility(types={"twitterv2-search"}) + compatibility = Compatibility(types={"twitterv2-search"}, preferred_followups=["tcat-auto-upload"]) def process(self): """ @@ -134,7 +132,6 @@ def map_to_TCAT(self, tweet): 'geo_full' : tweet.get('geo'), } - # Retweet - TCAT checks existance of 'retweeted_status' as key to determine if tweet is a retweet # We instead search for a referenced_tweets with type 'retweeted' # This assumes only one retweet in reference tweets (which has proven true in testing) diff --git a/processors/conversion/vision_api_to_csv.py b/processors/conversion/vision_api_to_csv.py index 8bd096698..578563156 100644 --- a/processors/conversion/vision_api_to_csv.py +++ b/processors/conversion/vision_api_to_csv.py @@ -30,6 +30,9 @@ class ConvertVisionOutputToCSV(BasicProcessor): "to the original dataset.") # description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Google Vision API output + compatibility = Compatibility(types={"google-vision-api"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> 
dict: """ @@ -50,9 +53,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - # Allow processor on Google Vision API output - compatibility = Compatibility(types={"google-vision-api"}) - def process(self): """ This takes the NDJSON file as input and writes the same data as a CSV file diff --git a/processors/metrics/group_hashes.py b/processors/metrics/group_hashes.py index cb79211fd..7712aa4fa 100644 --- a/processors/metrics/group_hashes.py +++ b/processors/metrics/group_hashes.py @@ -22,6 +22,9 @@ class HashGrouper(BasicProcessor): description = "Calculate groups of similar hashes from a CSV file." # description displayed in UI extension = "csv" + # Allow processor on image-hasher output (could also work on any CSV with the right fields) + compatibility = Compatibility(types={"image-hasher"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -45,9 +48,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - # Allow processor on image-hasher output (could also work on any CSV with the right fields) - compatibility = Compatibility(types={"image-hasher"}) - @staticmethod def compute_groups(hashes, hash_type: str, hash_size: int | None, similarity_pct: float) -> list[int]: """ @@ -151,7 +151,7 @@ def process(self): for item in self.source_dataset.iterate_items(self): if self.interrupted: raise ProcessorInterruptedException("Interrupted while grouping hashes") - + row = dict(item) # Discover and enforce a single hash_type row_hash_type = row.get("hash_type") diff --git a/processors/networks/clarifai_bipartite_network.py b/processors/networks/clarifai_bipartite_network.py index 8587275ae..35bbdcd2f 100644 --- a/processors/networks/clarifai_bipartite_network.py +++ b/processors/networks/clarifai_bipartite_network.py @@ -25,6 +25,9 @@ class VisionTagBiPartiteNetworker(BasicProcessor): "labels if the label occurs for the image with that file name." 
extension = "gexf" # extension of result file, used internally and in UI + # Allow processor to run on Clarifai API data + compatibility = Compatibility(types={"clarifai-api"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -47,9 +50,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - # Allow processor to run on Clarifai API data - compatibility = Compatibility(types={"clarifai-api"}) - def process(self): """ Generates a GEXF file-annotation graph. diff --git a/processors/networks/google_vision_bipartite_network.py b/processors/networks/google_vision_bipartite_network.py index e281ee0bb..aef0f3c19 100644 --- a/processors/networks/google_vision_bipartite_network.py +++ b/processors/networks/google_vision_bipartite_network.py @@ -26,6 +26,9 @@ class VisionTagBiPartiteNetworker(BasicProcessor): "labels if the label occurs for the image with that file name." extension = "gexf" # extension of result file, used internally and in UI + # Allow processor to run on Google Vision API data + compatibility = Compatibility(types={"google-vision-api"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -62,9 +65,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - # Allow processor to run on Google Vision API data - compatibility = Compatibility(types={"google-vision-api"}) - def process(self): """ Generates a GEXF file-annotation graph. diff --git a/processors/networks/google_vision_network.py b/processors/networks/google_vision_network.py index dbc284df4..9ddf0da8d 100644 --- a/processors/networks/google_vision_network.py +++ b/processors/networks/google_vision_network.py @@ -26,6 +26,9 @@ class VisionTagNetworker(BasicProcessor): "edges." 
extension = "gexf" # extension of result file, used internally and in UI + # Allow processor to run on Google Vision API data + compatibility = Compatibility(types={"google-vision-api"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -61,9 +64,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - # Allow processor to run on Google Vision API data - compatibility = Compatibility(types={"google-vision-api"}) - def process(self): """ Generates a GDF co-annotation graph. diff --git a/processors/networks/hash_similarity_network.py b/processors/networks/hash_similarity_network.py index df63bcd54..845c91814 100644 --- a/processors/networks/hash_similarity_network.py +++ b/processors/networks/hash_similarity_network.py @@ -28,6 +28,9 @@ class HashSimilarityNetworker(BasicProcessor): description = "Calculate similarity of hashes and create a GEXF network file. Can identify near duplicate hashes." extension = "gexf" + # Currently only allowed on video-hashes, though any row of bit hashes would work. + compatibility = Compatibility(types={"video-hashes"}) + @classmethod def get_options(cls, parent_dataset=None, config=None): """ @@ -68,9 +71,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - # Currently only allowed on video-hashes, though any row of bit hashes would work. - compatibility = Compatibility(types={"video-hashes"}) - def process(self): """ Takes a list of bit hashes and compares them. Then makes network file. diff --git a/processors/text-analysis/collocations.py b/processors/text-analysis/collocations.py index 179d21731..147608334 100644 --- a/processors/text-analysis/collocations.py +++ b/processors/text-analysis/collocations.py @@ -23,10 +23,8 @@ class GetCollocations(BasicProcessor): description = "Extracts words appearing close to each other from a set of tokens." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["preset-coword-network", "wordcloud"] - # Allow processor on token sets - compatibility = Compatibility(types={"tokenise-posts"}) + compatibility = Compatibility(types={"tokenise-posts"}, preferred_followups=["preset-coword-network", "wordcloud"]) @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: diff --git a/processors/text-analysis/documents_per_topic.py b/processors/text-analysis/documents_per_topic.py index 75bf3bf90..80375c339 100644 --- a/processors/text-analysis/documents_per_topic.py +++ b/processors/text-analysis/documents_per_topic.py @@ -25,8 +25,6 @@ class TopicModelWordExtractor(BasicProcessor): description = "Uses the LDA model to predict to which topic each item or sentence belongs and counts as belonging to whichever topic has the highest probability." # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] - # Allow processor on topic models compatibility = Compatibility(types={"topic-modeller"}) diff --git a/processors/text-analysis/generate_embeddings.py b/processors/text-analysis/generate_embeddings.py index d0159aa1f..ea352895b 100644 --- a/processors/text-analysis/generate_embeddings.py +++ b/processors/text-analysis/generate_embeddings.py @@ -33,7 +33,8 @@ class GenerateWordEmbeddings(BasicProcessor): "Note that good models require a lot of data." # description displayed in UI extension = "zip" # extension of result file, used internally and in UI - followups = ["similar-word2vec", "histwords-vectspace"] + # Allow processor on token sets + compatibility = Compatibility(types={"tokenise-posts"}, preferred_followups=["similar-word2vec", "histwords-vectspace"]) references = [ "word2vec: [Mikolov, Tomas, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. 2013. 
“Distributed Representations of Words and Phrases and Their Compositionality.” 8Advances in Neural Information Processing Systems*, 2013: 3111-3119.](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)", @@ -115,9 +116,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - # Allow processor on token sets - compatibility = Compatibility(types={"tokenise-posts"}) - def process(self): """ This takes a 4CAT results file as input, and outputs a number of files containing diff --git a/processors/text-analysis/post_topic_matrix.py b/processors/text-analysis/post_topic_matrix.py index 78d452b67..190abcef6 100644 --- a/processors/text-analysis/post_topic_matrix.py +++ b/processors/text-analysis/post_topic_matrix.py @@ -30,7 +30,8 @@ class TopicModelWordExtractor(BasicProcessor): "by multiple rows (for each sentence and/or column used).") # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] + # Allow processor on topic models + compatibility = Compatibility(types={"topic-modeller"}) @classmethod def get_options(cls, parent_dataset=None, config=None): @@ -76,9 +77,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - # Allow processor on topic models - compatibility = Compatibility(types={"topic-modeller"}) - def process(self): """ Extracts metadata and connects to original dataset diff --git a/processors/text-analysis/similar_words.py b/processors/text-analysis/similar_words.py index 5dbdfb2f6..7db6573c6 100644 --- a/processors/text-analysis/similar_words.py +++ b/processors/text-analysis/similar_words.py @@ -26,7 +26,8 @@ class SimilarWord2VecWords(BasicProcessor): description = "Uses a word2vec model to find words used in a similar context" # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["wordcloud"] + # Allow processor on word 
embedding models + compatibility = Compatibility(types={"generate-embeddings"}, preferred_followups=["wordcloud"]) flawless = True @@ -68,9 +69,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - # Allow processor on word embedding models - compatibility = Compatibility(types={"generate-embeddings"}) - def process(self): """ This takes previously generated Word2Vec models and uses them to find diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py index c2d2c183c..c95fd8928 100644 --- a/processors/text-analysis/tokenise.py +++ b/processors/text-analysis/tokenise.py @@ -37,7 +37,7 @@ class Tokenise(BasicProcessor): "tokens per sentence." # description displayed in UI extension = "zip" # extension of result file, used internally and in UI - followups = ["collocations", "vectorise-tokens", "generate-embeddings", "tfidf", "topic-modeller", ] + compatibility = Compatibility(extensions={"csv", "ndjson"}, preferred_followups=["collocations", "vectorise-tokens", "generate-embeddings", "tfidf", "topic-modeller", ]) references = [ "[NLTK tokenizer documentation](https://www.nltk.org/api/nltk.tokenize.html)", @@ -48,8 +48,6 @@ class Tokenise(BasicProcessor): "[Words in OpenTaal word list](https://github.com/OpenTaal/opentaal-wordlist)" ] - compatibility = Compatibility(extensions={"csv", "ndjson"}) - @classmethod def get_options(cls, parent_dataset=None, config=None): """ diff --git a/processors/text-analysis/top_vectors.py b/processors/text-analysis/top_vectors.py index 2eac313cd..a04d33d3c 100644 --- a/processors/text-analysis/top_vectors.py +++ b/processors/text-analysis/top_vectors.py @@ -25,7 +25,8 @@ class VectorRanker(BasicProcessor): "Limited to 100 most-used tokens." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["wordcloud"] + # Allow processor on token vectors + compatibility = Compatibility(types={"vectorise-tokens"}, preferred_followups=["wordcloud"]) @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -53,9 +54,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: }, } - # Allow processor on token vectors - compatibility = Compatibility(types={"vectorise-tokens"}) - def process(self): """ Reads vector set and creates a CSV with ranked vectors @@ -102,7 +100,7 @@ def file_to_timestamp(file): vectors = vector_unpacker.load(binary_tokens) vectors = sorted(vectors, key=lambda x: x[1], reverse=True) - + # for overall ranking we need the full vector space per interval # because maybe an overall top-ranking vector is at the bottom # in this particular interval - we'll truncate the top list at diff --git a/processors/text-analysis/topic_modeling.py b/processors/text-analysis/topic_modeling.py index 73348c618..d9070ab67 100644 --- a/processors/text-analysis/topic_modeling.py +++ b/processors/text-analysis/topic_modeling.py @@ -32,12 +32,14 @@ class TopicModeler(BasicProcessor): "which can be used to find clusters of related words." # description displayed in UI extension = "zip" # extension of result file, used internally and in UI - followups = ["document_count", "document_topic_matrix", "topic-model-words"] + # Allow processor on token sets + compatibility = Compatibility(types={"tokenise-posts"}, preferred_followups=["document_count", "document_topic_matrix", "topic-model-words"]) + references = [ 'Blei, David M., Andrew Y. Ng, and Michael I. Jordan (2003). "Latent dirichlet allocation." the *Journal of machine Learning research* 3: 993-1022.', 'Blei, David M. (2003). "Topic Modeling and Digital Humanities." *Journal of Digital Humanities* 2(1).' 
] - + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -86,9 +88,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - # Allow processor on token sets - compatibility = Compatibility(types={"tokenise-posts"}) - def process(self): """ Unzips token sets and builds topic models for each one. Model data is diff --git a/processors/text-analysis/topic_words.py b/processors/text-analysis/topic_words.py index 8b2c731ab..91d37d382 100644 --- a/processors/text-analysis/topic_words.py +++ b/processors/text-analysis/topic_words.py @@ -25,7 +25,8 @@ class TopicModelWordExtractor(BasicProcessor): description = "Creates a CSV file with the top tokens (words) per topic in the generated topic model, and their associated weights." # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["wordcloud"] + # Allow processor on topic models + compatibility = Compatibility(types={"topic-modeller"}, preferred_followups=["wordcloud"]) @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -49,9 +50,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - # Allow processor on topic models - compatibility = Compatibility(types={"topic-modeller"}) - def process(self): """ Extracts topics per model and top associated words diff --git a/processors/text-analysis/vectorise.py b/processors/text-analysis/vectorise.py index c32b815b5..0b591afba 100644 --- a/processors/text-analysis/vectorise.py +++ b/processors/text-analysis/vectorise.py @@ -23,10 +23,8 @@ class Vectorise(BasicProcessor): description = "Counts how often a token appears in the dataset. This creates a bag of words." 
# description displayed in UI extension = "zip" # extension of result file, used internally and in UI - followups = ["vector-ranker"] - # Allow processor on token sets - compatibility = Compatibility(types={"tokenise-posts"}) + compatibility = Compatibility(types={"tokenise-posts"}, preferred_followups=["vector-ranker"]) def process(self): """ diff --git a/processors/twitter/aggregate_stats.py b/processors/twitter/aggregate_stats.py index 7c84555ee..c07cc9aac 100644 --- a/processors/twitter/aggregate_stats.py +++ b/processors/twitter/aggregate_stats.py @@ -26,6 +26,9 @@ class TwitterAggregatedStats(BasicProcessor): description = "Group tweets by category and count tweets per timeframe and then calculate aggregate group statistics (i.e. min, max, average, Q1, median, Q3, and trimmed mean): number of tweets, urls, hashtags, mentions, etc. \nUse for example to find the distribution of the number of tweets per author and compare across time." # description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) + num_of_different_categories = None @classmethod @@ -67,9 +70,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - # Allow processor on Twitter/X datasets (API v2 or imported TCAT) - compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) - def trim_mean(self, values, cut_pct): """ Return mean of array after trimming a specified fraction of extreme values diff --git a/processors/twitter/base_twitter_stats.py b/processors/twitter/base_twitter_stats.py index 89cd7c471..ab3d37e0a 100644 --- a/processors/twitter/base_twitter_stats.py +++ b/processors/twitter/base_twitter_stats.py @@ -25,13 +25,12 @@ class TwitterStatsBase(BasicProcessor): description = "This is a class to help other twitter classes" # description displayed in UI extension = "csv" 
# extension of result file, used internally and in UI - sorted = False - - # Abstract base for the Twitter statistics processors; not runnable on its # own (an empty type set never matches a dataset). compatibility = Compatibility(types=set()) + sorted = False + def process(self): """ This takes a 4CAT twitter dataset file as input, and outputs a csv. diff --git a/processors/twitter/custom_stats.py b/processors/twitter/custom_stats.py index f6425e12d..e37cf73b9 100644 --- a/processors/twitter/custom_stats.py +++ b/processors/twitter/custom_stats.py @@ -22,6 +22,9 @@ class TwitterCustomStats(TwitterStatsBase): description = "Group tweets by category and count tweets per timeframe to collect aggregate group statistics.\nFor retweets and quotes, hashtags, mentions, URLs, and images from the original tweet are included in the retweet/quote. Data on public metrics (e.g., number of retweets or likes of tweets) are as of the time the data was collected." # description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) + sorted = 'Number of Tweets' @classmethod @@ -57,9 +60,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: }, } - # Allow processor on Twitter/X datasets (API v2 or imported TCAT) - compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) - def map_data(self, post): """ Maps a post to collect aggregate data. 
Returns a key for grouping data, a dictionary of aggregate data that can diff --git a/processors/twitter/hashtag_stats.py b/processors/twitter/hashtag_stats.py index 9eb6bf214..52bff79f1 100644 --- a/processors/twitter/hashtag_stats.py +++ b/processors/twitter/hashtag_stats.py @@ -21,6 +21,9 @@ class TwitterHashtagStats(TwitterStatsBase): description = "Lists by hashtag how many tweets contain hashtags, how many times those tweets have been retweeted/replied to/liked/quoted, and information about unique users and hashtags used alongside each hashtag.\nFor retweets and quotes, hashtags from the original tweet are included in the retweet/quote." # description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) + sorted = 'Number of Tweets containing Hashtag' @classmethod @@ -53,9 +56,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: # } } - # Allow processor on Twitter/X datasets (API v2 or imported TCAT) - compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) - def map_data(self, post): """ Maps a post to collect aggregate data. Returns a key for grouping data, a dictionary of aggregate data that can diff --git a/processors/twitter/identical_tweets.py b/processors/twitter/identical_tweets.py index 2783b333c..9dc923fe5 100644 --- a/processors/twitter/identical_tweets.py +++ b/processors/twitter/identical_tweets.py @@ -21,6 +21,9 @@ class TwitterIdenticalTweets(TwitterStatsBase): description = "Groups tweets by text and counts the number of times they have been (re)tweeted indentically." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) + sorted = 'Number of Identical Tweets' @classmethod @@ -44,9 +47,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: }, } - # Allow processor on Twitter/X datasets (API v2 or imported TCAT) - compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) - def map_data(self, post): """ Maps a post to collect aggregate data. Returns a key for grouping data, a dictionary of aggregate data that can diff --git a/processors/twitter/source_stats.py b/processors/twitter/source_stats.py index 6e1a8b8af..e5f25e00c 100644 --- a/processors/twitter/source_stats.py +++ b/processors/twitter/source_stats.py @@ -21,6 +21,9 @@ class TwitterHashtagStats(TwitterStatsBase): description = "Lists by source of tweet how many tweets contain hashtags, how many times those tweets have been retweeted/replied to/liked/quoted, and information about unique users and hashtags used alongside each hashtag.\nFor retweets and quotes, hashtags from the original tweet are included in the retweet/quote." # description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) + sorted = 'Number of Tweets from Source' @classmethod @@ -53,9 +56,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: # } } - # Allow processor on Twitter/X datasets (API v2 or imported TCAT) - compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) - def map_data(self, post): """ Maps a post to collect aggregate data. 
Returns a key for grouping data, a dictionary of aggregate data that can diff --git a/processors/twitter/twitter_stats.py b/processors/twitter/twitter_stats.py index 90a9e3dbe..4fe18d6df 100644 --- a/processors/twitter/twitter_stats.py +++ b/processors/twitter/twitter_stats.py @@ -21,6 +21,9 @@ class TwitterStats(TwitterStatsBase): description = "Contains the number of tweets, number of tweets with links, number of tweets with hashtags, number of tweets with mentions, number of retweets, and number of replies" # description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -48,9 +51,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - # Allow processor on Twitter/X datasets (API v2 or imported TCAT) - compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) - def map_data(self, post): """ Maps a post to collect aggregate data. Returns a key for grouping data, a dictionary of sum data that can diff --git a/processors/twitter/user_stats_individual.py b/processors/twitter/user_stats_individual.py index cc25f6561..ee9cfdf91 100644 --- a/processors/twitter/user_stats_individual.py +++ b/processors/twitter/user_stats_individual.py @@ -22,6 +22,9 @@ class TwitterStats(TwitterStatsBase): description = "Lists users and their number of tweets, number of followers, number of friends, how many times they are listed, their UTC time offset, whether the user has a verified account and how many times they appear in the data set." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) + sorted = "Tweets (in interval)" @classmethod @@ -53,9 +56,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: # } } - # Allow processor on Twitter/X datasets (API v2 or imported TCAT) - compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) - def map_data(self, post): """ Maps a post to collect aggregate data. Returns a key for grouping data, a dictionary of aggregate data that can diff --git a/processors/twitter/user_visibility.py b/processors/twitter/user_visibility.py index 2aade0846..36f458a15 100644 --- a/processors/twitter/user_visibility.py +++ b/processors/twitter/user_visibility.py @@ -25,6 +25,9 @@ class TwitterUserVisibility(BasicProcessor): description = "Collects usernames and totals how many tweets are authored by the user and how many tweets mention the user" # description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -46,9 +49,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - # Allow processor on Twitter/X datasets (API v2 or imported TCAT) - compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) - def process(self): """ This takes a 4CAT twitter dataset file as input, and outputs a csv. 
diff --git a/processors/visualisation/download_tiktok.py b/processors/visualisation/download_tiktok.py index f43eab457..158637da1 100644 --- a/processors/visualisation/download_tiktok.py +++ b/processors/visualisation/download_tiktok.py @@ -32,7 +32,8 @@ class TikTokImageDownloader(BasicProcessor): extension = "zip" media_type = "image" - followups = ImageDownloader.followups + # Allow processor on TikTok datasets + compatibility = Compatibility(types={"tiktok-search", "tiktok-urls-search"}, preferred_followups=ImageDownloader.followups) @classmethod def get_options(cls, parent_dataset=None, config=None): @@ -83,9 +84,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - # Allow processor on TikTok datasets - compatibility = Compatibility(types={"tiktok-search", "tiktok-urls-search"}) - def process(self): """ Reads a file, filtering items that match in the required way, and diff --git a/processors/visualisation/histwords.py b/processors/visualisation/histwords.py index 3cbe90731..1fbe4302c 100644 --- a/processors/visualisation/histwords.py +++ b/processors/visualisation/histwords.py @@ -42,6 +42,9 @@ class HistWordsVectorSpaceVisualiser(BasicProcessor): description = "Visualise nearest neighbours of a given query across all models and show the closest neighbours per model in one combined graph. Based on the 'HistWords' algorithm by Hamilton et al." # description displayed in UI extension = "svg" # extension of result file, used internally and in UI + # Allow processor on word embedding models + compatibility = Compatibility(types={"generate-embeddings"}) + references = [ "HistWords: [Hamilton, W. L., Leskovec, J., & Jurafsky, D. (2016). Diachronic word embeddings reveal statistical laws of semantic change. *arXiv preprint** arXiv:1605.09096.](https://arxiv.org/pdf/1605.09096.pdf)", "HistWords: [William L. Hamilton, Jure Leskovec, and Dan Jurafsky. 
HistWords: Word Embeddings for Historical Text](https://nlp.stanford.edu/projects/histwords/)", @@ -105,9 +108,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - # Allow processor on word embedding models - compatibility = Compatibility(types={"generate-embeddings"}) - def process(self): # parse parameters input_words = self.parameters.get("words", "") diff --git a/processors/visualisation/video_hasher.py b/processors/visualisation/video_hasher.py index d7e8ceaf8..f63120803 100644 --- a/processors/visualisation/video_hasher.py +++ b/processors/visualisation/video_hasher.py @@ -357,6 +357,9 @@ class VideoHashNetwork(BasicProcessor): description = "Creates hashes network to identify duplicate or similar videos." # description displayed in UI extension = "gexf" # extension of result file, used internally and in UI + # Allow on video hasher + compatibility = Compatibility(types={"video-hasher-1"}) + references = [ "[Video Hash](https://github.com/akamhy/videohash#readme)", ] @@ -371,9 +374,6 @@ def get_options(cls, parent_dataset=None, config=None): "max": 100 }} - # Allow on video hasher - compatibility = Compatibility(types={"video-hasher-1"}) - def process(self): """ @@ -471,6 +471,9 @@ class VideoHashSimilarities(BasicProcessor): description = "Creates CSV with hashes and groups videos above similarity value." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow on video hasher + compatibility = Compatibility(types={"video-hasher-1"}) + references = [ "[Video Hash](https://github.com/akamhy/videohash#readme)", ] @@ -485,9 +488,6 @@ def get_options(cls, parent_dataset=None, config=None): "max": 100 }} - # Allow on video hasher - compatibility = Compatibility(types={"video-hasher-1"}) - def process(self): """ diff --git a/processors/visualisation/video_timelines.py b/processors/visualisation/video_timelines.py index 7c6a3fcc2..e2036c4bc 100644 --- a/processors/visualisation/video_timelines.py +++ b/processors/visualisation/video_timelines.py @@ -39,6 +39,10 @@ class VideoTimelines(BasicProcessor): "collage of sequential frames). Timelines are then vertically stacked." # description displayed in UI extension = "svg" # extension of result file, used internally and in UI + # Compatible with extracted video frames (or anything that stores related + # images in separate folders within a zip archive). + compatibility = Compatibility(types={"video-frames", "video-scene-frames"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -63,10 +67,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - # Compatible with extracted video frames (or anything that stores related - # images in separate folders within a zip archive). - compatibility = Compatibility(types={"video-frames", "video-scene-frames"}) - def process(self): metadata = {} base_height = self.parameters.get("height", 100) diff --git a/processors/visualisation/youtube_imagewall.py b/processors/visualisation/youtube_imagewall.py index f205c4478..94e9c969d 100644 --- a/processors/visualisation/youtube_imagewall.py +++ b/processors/visualisation/youtube_imagewall.py @@ -33,6 +33,9 @@ class YouTubeImageWall(BasicProcessor): description = "Make an image wall from YouTube video thumbnails." 
# description displayed in UI extension = "png" # extension of result file, used internally and in UI + # Allow processor on YouTube thumbnail sets + compatibility = Compatibility(types={"youtube-thumbnails"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -57,9 +60,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - # Allow processor on YouTube thumbnail sets - compatibility = Compatibility(types={"youtube-thumbnails"}) - def process(self): """ Takes the thumbnails downloaded from YouTube metadata and diff --git a/processors/visualisation/youtube_thumbnails.py b/processors/visualisation/youtube_thumbnails.py index 58b955154..111cd6a43 100644 --- a/processors/visualisation/youtube_thumbnails.py +++ b/processors/visualisation/youtube_thumbnails.py @@ -19,7 +19,7 @@ class YouTubeThumbnails(BasicProcessor): """ - + Downloads YouTube thumbnails. """ @@ -31,14 +31,12 @@ class YouTubeThumbnails(BasicProcessor): extension = "zip" # extension of result file, used internally and in UI media_type = "image" # media type of the result - followups = ["youtube-imagewall"] + # Allow processor on YouTube metadata sets + compatibility = Compatibility(types={"youtube-metadata"}, preferred_followups=["youtube-imagewall"]) max_retries = 3 sleep_time = 10 - # Allow processor on YouTube metadata sets - compatibility = Compatibility(types={"youtube-metadata"}) - @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -80,7 +78,7 @@ def download_thumbnails(self, video_ids): # prepare staging area results_path = self.dataset.get_staging_area() - + api_key = self.parameters.get("key") if not api_key: api_key = self.config.get("api.youtube.key") @@ -88,7 +86,7 @@ def download_thumbnails(self, video_ids): self.dataset.finish_with_error("You need to provide a valid API key") return self.api_key = api_key - + # Use YouTubeDL and the YouTube API to request video data youtube = build("youtube", "v3", 
developerKey=api_key) From 500d6d12fd305e301773b79645c462e831c21d50 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 16 Jun 2026 12:36:40 +0200 Subject: [PATCH 09/30] fold in exclude processor followups into compatibility --- backend/lib/processor.py | 19 ++++++++-------- processors/text-analysis/tf_idf.py | 24 ++++++++------------ processors/text-analysis/vectorise_by_cat.py | 22 ++++++++---------- 3 files changed, 28 insertions(+), 37 deletions(-) diff --git a/backend/lib/processor.py b/backend/lib/processor.py index 369ff2019..3b5d1a4ce 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -1116,19 +1116,18 @@ def is_rankable(cls, multiple_items=True): @classmethod def exclude_followup_processors(cls, processor_type=None): """ - Used for processor compatibility - - To be defined by the child processor if it should exclude certain follow-up processors. - e.g.: + Determine whether a follow-up processor should be excluded. - def exclude_followup_processors(cls, processor_type): - if processor_type in ["undesirable-followup-processor"]: - return True - return False + Follow-up processors that should never be offered after this one are + listed in the `excluded_followups` field of the `compatibility` + specification. Processors with dynamic exclusion logic may override this + method instead. 
- :param str processor_type: Processor type to exclude - :return bool: True if processor should be excluded, False otherwise + :param str processor_type: Processor type to check + :return bool: True if the follow-up should be excluded, False otherwise """ + if cls.compatibility is not None and processor_type in cls.compatibility.excluded_followups: + return True return False @abc.abstractmethod diff --git a/processors/text-analysis/tf_idf.py b/processors/text-analysis/tf_idf.py index 90e165085..4e7378c4f 100644 --- a/processors/text-analysis/tf_idf.py +++ b/processors/text-analysis/tf_idf.py @@ -32,7 +32,15 @@ class TfIdf(BasicProcessor): description = "Get the tf-idf values of tokenised text. Works better with more documents (e.g. time-separated)." # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["wordcloud"] + # Allow processor on token sets + compatibility = Compatibility( + types={"tokenise-posts"}, + preferred_followups=["wordcloud"], + excluded_followups=[ + "consolidate-urls", "preset-neologisms", "sentence-split", "tokenise-posts", + "image-downloader-stable-diffusion", "word-trees", "histogram", "extract-urls-filter", + ], + ) references = [ "[Spärck Jones, Karen. 1972. 
\"A statistical interpretation of term specificity and its application in retrieval.\" *Journal of Documentation* (28), 1: 11–21.](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.115.8343&rep=rep1&type=pdf)", @@ -106,9 +114,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - # Allow processor on token sets - compatibility = Compatibility(types={"tokenise-posts"}) - def process(self): """ Unzips and appends tokens to fetch and write a tf-idf matrix @@ -167,7 +172,7 @@ def process(self): if max_occurrences <= 0 or max_occurrences > len(tokens): max_occurrences = len(tokens) self.dataset.log(f"Running tf-idf with library {library}, n_size {n_size}, min_occurrences {min_occurrences}, max_occurrences {max_occurrences}, max_output {max_output}, smartirs {smartirs}") - + # Get the tf-idf matrix. self.dataset.update_status("Generating tf-idf for token set") try: @@ -300,12 +305,3 @@ def get_tfidf_sklearn(self, tokens, dates, ngram_range=(1, 1), min_occurrences=0 results.append(result) return results - - @classmethod - def exclude_followup_processors(cls, processor_type): - """ - Exclude followups if they are not compatible with the module - """ - if processor_type in ["consolidate-urls", "preset-neologisms", "sentence-split", "tokenise-posts", "image-downloader-stable-diffusion", "word-trees", "histogram", "extract-urls-filter"]: - return True - return False diff --git a/processors/text-analysis/vectorise_by_cat.py b/processors/text-analysis/vectorise_by_cat.py index 47d098869..eacf9a2a1 100644 --- a/processors/text-analysis/vectorise_by_cat.py +++ b/processors/text-analysis/vectorise_by_cat.py @@ -24,10 +24,15 @@ class VectoriseByCategory(BasicProcessor): description = "Counts all tokens per category." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["wordcloud", "render-graphs-isometric", "render-rankflow"] - # Allow processor on token sets - compatibility = Compatibility(types={"tokenise-posts"}) + compatibility = Compatibility( + types={"tokenise-posts"}, + preferred_followups=["wordcloud", "render-graphs-isometric", "render-rankflow"], + excluded_followups=[ + "consolidate-urls", "preset-neologisms", "sentence-split", "tokenise-posts", + "image-downloader-stable-diffusion", "word-trees", "histogram", "extract-urls-filter", + ], + ) @classmethod def get_options(cls, parent_dataset=None, config=None): @@ -288,13 +293,4 @@ def process(self): # Finish self.dataset.update_status("Finished") - self.dataset.finish(done) - - @classmethod - def exclude_followup_processors(cls, processor_type): - """ - Exclude followups if they are not compatible with the module - """ - if processor_type in ["consolidate-urls", "preset-neologisms", "sentence-split", "tokenise-posts", "image-downloader-stable-diffusion", "word-trees", "histogram", "extract-urls-filter"]: - return True - return False \ No newline at end of file + self.dataset.finish(done) \ No newline at end of file From 6004dbaa050be69a483af8a11b7381ff50b0a7f1 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 16 Jun 2026 14:35:57 +0200 Subject: [PATCH 10/30] compatibility: convert extension type checks --- processors/conversion/consolidate_urls.py | 16 +++----- processors/conversion/convert_text.py | 14 ++----- processors/conversion/csv_to_json.py | 13 ++----- processors/conversion/item_to_annotation.py | 14 ++----- processors/conversion/ndjson_to_csv.py | 12 ++---- processors/metrics/rank_attribute.py | 15 ++------ processors/networks/gexf_to_csv.py | 13 ++----- processors/networks/two-column-network.py | 22 ++++------- processors/presets/neologisms.py | 15 ++------ .../statistics/classification_evaluation.py | 15 ++------ 
processors/statistics/confusion_matrix.py | 15 ++------ .../statistics/descriptive_statistics.py | 29 ++++++--------- .../statistics/regression-evaluation.py | 37 ++++++++----------- processors/text-analysis/split_sentences.py | 15 ++------ 14 files changed, 75 insertions(+), 170 deletions(-) diff --git a/processors/conversion/consolidate_urls.py b/processors/conversion/consolidate_urls.py index 800433647..e3de3d5b6 100644 --- a/processors/conversion/consolidate_urls.py +++ b/processors/conversion/consolidate_urls.py @@ -8,6 +8,7 @@ from processors.conversion.extract_urls import ExtractURLs from common.lib.exceptions import ProcessorInterruptedException from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput, split_urls __author__ = "Dale Wahl" @@ -28,6 +29,9 @@ class ConsolidateURLs(BasicProcessor): description = "Retain only the domain (and optionally path) of URLs; used for custom networks (e.g. author + domains)" extension = "csv" + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) + # Common domain prefaces to remove domain_prefaces = ["m", "www"] # Domain dictionary (after domain_prefaces are removed) with additional rules based on URL components to conform to "clean URLs" @@ -232,7 +236,7 @@ def get_options(cls, parent_dataset=None, config=None): "requires": "method==custom" }, } - + # Get the columns for the select columns option if parent_dataset: columns = parent_dataset.get_columns() # call once @@ -247,16 +251,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - This is meant to be inherited by other child classes - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_extension() in ["csv", "ndjson"] - def process(self): 
method = self.parameters.get("method", False) url_parsing_issues = [] diff --git a/processors/conversion/convert_text.py b/processors/conversion/convert_text.py index 5afd4396c..007c245e6 100644 --- a/processors/conversion/convert_text.py +++ b/processors/conversion/convert_text.py @@ -6,6 +6,7 @@ from common.lib.exceptions import ProcessorInterruptedException from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Sal Hagen" @@ -25,6 +26,9 @@ class ConvertText(BasicProcessor): "also be added to the original dataset as annotations.") # description displayed in UI extension = "csv" + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -83,16 +87,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: k: "text" in k).pop() return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on NDJSON and CSV files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_extension() in ("csv", "ndjson") - def process(self): """ Create a generator to iterate through items that can be passed to create either a csv or ndjson. 
diff --git a/processors/conversion/csv_to_json.py b/processors/conversion/csv_to_json.py index aef900379..418bfd05e 100644 --- a/processors/conversion/csv_to_json.py +++ b/processors/conversion/csv_to_json.py @@ -4,6 +4,7 @@ import json from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -20,16 +21,8 @@ class ConvertCSVToJSON(BasicProcessor): description = "Change a CSV file to a JSON file" # description displayed in UI extension = "json" # extension of result file, used internally and in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with a dataset or processor - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - - return module.get_extension() == "csv" + # Allow on CSV datasets + compatibility = Compatibility(extensions={"csv"}) def process(self): """ diff --git a/processors/conversion/item_to_annotation.py b/processors/conversion/item_to_annotation.py index dfe17f9b5..bd91871a6 100644 --- a/processors/conversion/item_to_annotation.py +++ b/processors/conversion/item_to_annotation.py @@ -3,6 +3,7 @@ """ from common.lib.exceptions import ProcessorInterruptedException from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Sal Hagen" @@ -23,6 +24,9 @@ class ItemToAnnotation(BasicProcessor): "Explorer. 
Item values must be numbers or strings.") # description displayed in UI extension = "csv" + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -43,16 +47,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on NDJSON and CSV files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def process(self): """ :return generator: diff --git a/processors/conversion/ndjson_to_csv.py b/processors/conversion/ndjson_to_csv.py index 75b012f6e..0782e0954 100644 --- a/processors/conversion/ndjson_to_csv.py +++ b/processors/conversion/ndjson_to_csv.py @@ -6,6 +6,7 @@ from common.lib.helpers import flatten_dict from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException __author__ = "Dale Wahl" @@ -26,15 +27,8 @@ class ConvertNDJSONtoCSV(BasicProcessor): "contain nested data." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_extension() == "ndjson" + # Allow on NDJSON datasets + compatibility = Compatibility(extensions={"ndjson"}) def process(self): """ diff --git a/processors/metrics/rank_attribute.py b/processors/metrics/rank_attribute.py index 2f50d623a..62fc7f2a6 100644 --- a/processors/metrics/rank_attribute.py +++ b/processors/metrics/rank_attribute.py @@ -8,6 +8,7 @@ from itertools import islice, chain from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput, convert_to_int, get_interval_descriptor __author__ = "Stijn Peeters" @@ -30,23 +31,13 @@ class AttributeRanker(BasicProcessor): description = "Count values in a dataset column, like URLs or hashtags (overall or per timeframe)" # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) references = ["[regex010](https://regex101.com/)"] include_missing_data = True - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on all csv and NDJSON datasets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - - return module.get_extension() in ("csv", "ndjson") - @classmethod def get_options(cls, parent_dataset=None, config=None): diff --git a/processors/networks/gexf_to_csv.py b/processors/networks/gexf_to_csv.py index 609bb93ae..729ca7b8b 100644 --- 
a/processors/networks/gexf_to_csv.py +++ b/processors/networks/gexf_to_csv.py @@ -2,6 +2,7 @@ Convert a GEXF network file to a CSV file """ from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility import networkx as nx import csv @@ -23,14 +24,8 @@ class GexfToCsv(BasicProcessor): description = "Convert a GEXF network file to a CSV spreadsheet" extension = "csv" - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on all csv and NDJSON datasets - - :param module: Module to determine compatibility with - """ - return module.get_extension() in ["gexf"] + # Allow on GEXF datasets + compatibility = Compatibility(extensions={"gexf"}) def process(self): """ @@ -58,7 +53,7 @@ def process(self): result.update({"target": target}) result.update({f"target_{k}": v for k,v in target_attributes.items()}) result.update({f"edge_{attr_key}": edge_attributes[attr_key] for attr_key in sorted(edge_attributes, key=lambda k: k == "id", reverse=True)}) - + if writer is False: # Write header # Notes: this assumes that all nodes have the same attributes which ought to be True for GEXF files written by 4CAT diff --git a/processors/networks/two-column-network.py b/processors/networks/two-column-network.py index 5217bf393..f79cb6f85 100644 --- a/processors/networks/two-column-network.py +++ b/processors/networks/two-column-network.py @@ -5,6 +5,7 @@ from functools import partial from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput, get_interval_descriptor import networkx as nx @@ -27,6 +28,9 @@ class ColumnNetworker(BasicProcessor): "(e.g. 'author' and 'subreddit'). Nodes and edges are weighted by frequency." 
extension = "gexf" + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) + references = [ "Utilises [Networkx](https://networkx.org/)' built-in [Louvain](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.community.louvain.louvain_communities.html#networkx.algorithms.community.louvain.louvain_communities) and [greedy modularity](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.community.modularity_max.greedy_modularity_communities.html#networkx.algorithms.community.modularity_max.greedy_modularity_communities) community detection algorithms." ] @@ -139,16 +143,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on all csv and NDJSON datasets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_extension() in ("csv", "ndjson") - def process(self): """ This takes a 4CAT results file as input, and creates a network file @@ -226,7 +220,7 @@ def process(self): raise ValueError(f"Date '{item.get('timestamp')}' cannot be parsed") except ValueError as e: return self.dataset.finish_with_error(f"{e}, cannot count posts per {interval_type}") - + # Track nodes per item (categoise option adjusts node name to include column if True) processed_nodes = set() @@ -256,13 +250,13 @@ def process(self): network.nodes[node_a]["intervals"][interval] += 1 processed_nodes.add(node_a) - + if node_b not in processed_nodes: if node_b not in network.nodes(): network.add_node(node_b, intervals={}, frequency=1, label=value_b, **({"category": column_b} if categorise else {})) else: network.nodes[node_b]["frequency"] += 1 - + if interval not in network.nodes[node_b]["intervals"]: network.nodes[node_b]["intervals"][interval] = 0 
network.nodes[node_b]["intervals"][interval] += 1 @@ -275,7 +269,7 @@ def process(self): edge = tuple(sorted((node_a, node_b))) else: edge = (node_a, node_b) - + if edge not in processed_edges: if edge not in network.edges(): network.add_edge(node_a, node_b, intervals={}, frequency=1, weight=1) diff --git a/processors/presets/neologisms.py b/processors/presets/neologisms.py index 863c70c11..0b1e0179d 100644 --- a/processors/presets/neologisms.py +++ b/processors/presets/neologisms.py @@ -2,6 +2,7 @@ Extract neologisms """ from backend.lib.preset import ProcessorPreset +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput @@ -17,6 +18,9 @@ class NeologismExtractor(ProcessorPreset): "language data. Uses stopwords-iso as a stopword filter.") extension = "csv" + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) + references = [ "Van Soest, Jeroen. 2019. 'Language Innovation Tracker: Detecting language innovation in online discussion fora.' (MA thesis), Beuls, K. (Promotor), Van Eecke, P. 
(Advisor).'"] @@ -50,17 +54,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on all csv and NDJSON datasets - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - - return module.get_extension() in ("csv", "ndjson") - def get_processor_pipeline(self): """ This queues a series of post-processors to extract neologisms from a diff --git a/processors/statistics/classification_evaluation.py b/processors/statistics/classification_evaluation.py index 85fcee44c..512e2a67d 100644 --- a/processors/statistics/classification_evaluation.py +++ b/processors/statistics/classification_evaluation.py @@ -5,6 +5,7 @@ from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import UserInput, andify from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from sklearn.preprocessing import MultiLabelBinarizer from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, cohen_kappa_score @@ -26,6 +27,9 @@ class ClassificationEvaluation(BasicProcessor): "and Cohen's Kappa). Produces overall and per-label metrics. 
Also supports multi-label values.") extension = "csv" # extension of result file, used internally and in UI + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: options = { @@ -100,17 +104,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: return options - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.get_extension() in ("csv", "ndjson") - def process(self): skip_empty = self.parameters.get("skip_empty", False) diff --git a/processors/statistics/confusion_matrix.py b/processors/statistics/confusion_matrix.py index 9a33e7c86..00e4c521e 100644 --- a/processors/statistics/confusion_matrix.py +++ b/processors/statistics/confusion_matrix.py @@ -4,6 +4,7 @@ from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import UserInput from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay import matplotlib @@ -25,6 +26,9 @@ class ConfusionMatrix(BasicProcessor): description = "Create a confusion matrix with data from two columns." 
# description displayed in UI extension = "png" # extension of result file, used internally and in UI + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: options = { @@ -51,17 +55,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: return options - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.get_extension() in ("csv", "ndjson") - def process(self): skip_empty = self.parameters.get("skip_empty", False) diff --git a/processors/statistics/descriptive_statistics.py b/processors/statistics/descriptive_statistics.py index 01f3d4b2d..6156ecfac 100644 --- a/processors/statistics/descriptive_statistics.py +++ b/processors/statistics/descriptive_statistics.py @@ -5,6 +5,7 @@ from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import UserInput from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility import numpy as np @@ -24,6 +25,9 @@ class DescriptiveStatistics(BasicProcessor): description = "Calculate descriptive statistics (mean, median, std dev, etc.) for numerical columns." 
extension = "csv" # extension of result file, used internally in UI + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: options = { @@ -51,21 +55,10 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: return options - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.get_extension() in ("csv", "ndjson") - def process(self): skip_empty = self.parameters.get("skip_empty", True) selected_columns = self.parameters.get("columns", []) - + if not selected_columns: self.dataset.finish_with_error("Please select at least one column to analyze") return @@ -87,14 +80,14 @@ def process(self): # First pass: check if we can process this row for col in selected_columns: val = item.get(col, "") - + # Handle empty values if val is None or val == "": if not skip_empty: row_valid = False break continue - + # Try to convert to float try: float_val = float(val) @@ -108,7 +101,7 @@ def process(self): return row_valid = False break - + # Second pass: add valid values to our data structure if row_valid and row_values: for col in selected_columns: @@ -122,15 +115,15 @@ def process(self): # Calculate statistics for each column results = [] - + for column in selected_columns: if not column_data[column]: self.dataset.finish_with_error(f"No valid numerical values found in column '{column}'") return - + # Convert to numpy array for calculations values = np.array(column_data[column]) - + # Calculate statistics stats = {"column": column} diff --git a/processors/statistics/regression-evaluation.py b/processors/statistics/regression-evaluation.py index 69dea0e26..09fc65da9 100644 --- a/processors/statistics/regression-evaluation.py +++ 
b/processors/statistics/regression-evaluation.py @@ -5,6 +5,7 @@ from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import UserInput from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score import numpy as np @@ -25,6 +26,9 @@ class RegressionEvaluation(BasicProcessor): description = "Calculate regression metrics (MAE, MSE, R2, RMSE) between two numerical columns." extension = "csv" # extension of result file, used internally in UI + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: options = { @@ -70,25 +74,14 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: return options - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.get_extension() in ("csv", "ndjson") - def process(self): skip_empty = self.parameters.get("skip_empty", True) metrics = self.parameters.get("metrics", ["mae", "mse", "rmse", "r2"]) - + if not metrics: self.dataset.finish_with_error("Please select at least one evaluation metric") return - + # Get which metrics to calculate get_mae = "mae" in metrics get_mse = "mse" in metrics @@ -98,7 +91,7 @@ def process(self): # Parse the column names column_true = self.parameters.get("column_true", "") column_pred = self.parameters.get("column_pred", "") - + if not column_true or not column_pred: self.dataset.finish_with_error("Please specify which columns contain the true and predicted values") return @@ -122,7 +115,7 @@ def process(self): if skip_empty and (true_val is None or pred_val is None or true_val == "" or pred_val == ""): 
skipped_rows += 1 continue - + # Try to convert to float try: true_float = float(true_val) @@ -142,42 +135,42 @@ def process(self): if not true_values or not pred_values: self.dataset.finish_with_error("No valid numerical values found in the specified columns") return - + if len(true_values) != len(pred_values): self.dataset.finish_with_error("Mismatch in number of true and predicted values") return - + if skipped_rows > 0: self.dataset.update_status(f"Skipped {skipped_rows} rows with missing or invalid values") # Convert to numpy arrays for calculations true_array = np.array(true_values) pred_array = np.array(pred_values) - + # Calculate metrics results = [] - + if get_mae: mae = mean_absolute_error(true_array, pred_array) results.append({ "metric": "MAE", "value": round(mae, 5) }) - + if get_mse: mse = mean_squared_error(true_array, pred_array) results.append({ "metric": "MSE", "value": round(mse, 5) }) - + if get_rmse: rmse = np.sqrt(mean_squared_error(true_array, pred_array)) results.append({ "metric": "RMSE", "value": round(rmse, 5) }) - + if get_r2: r2 = r2_score(true_array, pred_array) results.append({ diff --git a/processors/text-analysis/split_sentences.py b/processors/text-analysis/split_sentences.py index 35a9016b5..e5a5cc712 100644 --- a/processors/text-analysis/split_sentences.py +++ b/processors/text-analysis/split_sentences.py @@ -6,6 +6,7 @@ from common.lib.helpers import UserInput from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -23,7 +24,8 @@ class SplitSentences(BasicProcessor): description = "Split a body of posts into discrete sentences. Output file has one row per sentence, containing the sentence and item ID." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) @classmethod def get_options(cls, parent_dataset=None, config=None): @@ -85,17 +87,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on all csv and NDJSON datasets - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - - return module.get_extension() in ("csv", "ndjson") - def process(self): """ This takes a 4CAT results file as input, and outputs a number of files containing From 91658ae7c8007d65168d3e31db39c7de86271c4d Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 16 Jun 2026 14:36:42 +0200 Subject: [PATCH 11/30] base_filter: abstract class compatibility --- processors/filtering/base_filter.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/processors/filtering/base_filter.py b/processors/filtering/base_filter.py index 080686129..30168ccdb 100644 --- a/processors/filtering/base_filter.py +++ b/processors/filtering/base_filter.py @@ -6,6 +6,7 @@ import json from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -26,15 +27,8 @@ class BaseFilter(BasicProcessor): item_ids = [] - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - This is meant to be inherited by other child classes - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return False + # Abstract base filter; not runnable on its own (empty type set never matches) + compatibility = Compatibility(types=set()) def process(self): 
""" From 1168256aea574a15213a630d7cc9b1f593250955 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 16 Jun 2026 14:40:30 +0200 Subject: [PATCH 12/30] compatibility: convert top_dataset checks plus extension check --- processors/conversion/remove_author_info.py | 14 ++++---------- processors/conversion/stringify.py | 19 ++++++------------- processors/conversion/upload_annotations.py | 14 ++++---------- processors/filtering/accent_fold.py | 15 ++++----------- processors/filtering/date_filter.py | 14 ++++---------- processors/filtering/lexical_filter.py | 14 ++++---------- processors/filtering/random_filter.py | 14 ++++---------- processors/filtering/unique_filter.py | 12 +++--------- processors/metrics/count_posts.py | 15 +++------------ processors/metrics/thread_metadata.py | 15 +++------------ processors/metrics/url_titles.py | 15 +++------------ processors/metrics/vocabulary_overtime.py | 15 +++------------ processors/networks/colink_urls.py | 20 +++++++------------- processors/networks/wikipedia_network.py | 16 +++++----------- processors/presets/annotate-images.py | 15 ++++----------- processors/presets/monthly-histogram.py | 15 +++------------ processors/presets/similar-words.py | 17 ++++------------- 17 files changed, 68 insertions(+), 191 deletions(-) diff --git a/processors/conversion/remove_author_info.py b/processors/conversion/remove_author_info.py index f7df341cb..ad1e8daf1 100644 --- a/processors/conversion/remove_author_info.py +++ b/processors/conversion/remove_author_info.py @@ -10,6 +10,7 @@ import csv from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import dict_search_and_update, UserInput, HashCache __author__ = "Stijn Peeters" @@ -30,22 +31,15 @@ class AuthorInfoRemover(BasicProcessor): title = "Pseudonymise or anonymise" # title displayed in UI description = "Removes or replaces data from the dataset in fields identified as containing personal information" + # Allow on 
top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + references = [ "[What is a hash?](https://techterms.com/definition/hash)", "[What is a salt?](https://en.wikipedia.org/wiki/Salt_(cryptography))", "[What is Blake2?](https://en.wikipedia.org/wiki/BLAKE_(hash_function)#BLAKE2)" ] - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on CSV files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ["csv", 'ndjson'] - @classmethod def get_options(cls, parent_dataset=None, config=None): options = { diff --git a/processors/conversion/stringify.py b/processors/conversion/stringify.py index 3bf1c89b1..95ed4b2d2 100644 --- a/processors/conversion/stringify.py +++ b/processors/conversion/stringify.py @@ -5,6 +5,7 @@ import string from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Sal Hagen" @@ -22,11 +23,14 @@ class Stringify(BasicProcessor): description = "Merges the data from the body column into a single text file. The result can be used for word clouds, word trees, etc." # description displayed in UI extension = "txt" # extension of result file, used internally and in UI + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ Get processor options - + :param parent_dataset DataSet: An object representing the dataset that the processor would be or was run on. Can be used, in conjunction with config, to show some options only to privileged users. 
@@ -57,17 +61,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility; this processor is only compatible with top datasets in CSV or NDJSON format. - - :param str module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def process(self): """ This takes a 4CAT results file as input, and outputs a plain text file @@ -86,7 +79,7 @@ def process(self): regex += "0-9" if strip_punctuation: regex += string.punctuation - + delete_regex = re.compile("[\n\t" + regex + "]") posts = 0 diff --git a/processors/conversion/upload_annotations.py b/processors/conversion/upload_annotations.py index 942bce24d..958fbfa15 100644 --- a/processors/conversion/upload_annotations.py +++ b/processors/conversion/upload_annotations.py @@ -7,6 +7,7 @@ from flask import g from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException, QueryParametersException, DataSetException from common.lib.helpers import UserInput from common.lib.dataset import DataSet @@ -29,6 +30,9 @@ class UploadAnnotations(BasicProcessor): "For CSV file uploads, comma is used as the separator. 
For text input, a custom separator can be specified.") extension = "csv" + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -78,16 +82,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on top-level CSV and NDJSON datasets - - :param module: Module to determine compatibility with - :param config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - @staticmethod def validate_query(query, request, config): """ diff --git a/processors/filtering/accent_fold.py b/processors/filtering/accent_fold.py index ee8d1f380..aed4de800 100644 --- a/processors/filtering/accent_fold.py +++ b/processors/filtering/accent_fold.py @@ -6,6 +6,7 @@ from unidecode import unidecode from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Stijn Peeters" @@ -27,16 +28,8 @@ class AccentFoldingFilter(BasicProcessor): "'á' to 'a', 'ç' to 'c', etc. 
This creates a new dataset.") extension = "csv" # extension of result file, used internally and in UI - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on iterable files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ["csv"] + # Allow on top-level CSV datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv"}) def process(self): """ @@ -105,7 +98,7 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: :param config ConfigManager|None config: Configuration reader (context-aware) :return dict: Options for this processor """ - + options = { "mode": { "help": "What to replace?", diff --git a/processors/filtering/date_filter.py b/processors/filtering/date_filter.py index ce8046066..b82e31ff6 100644 --- a/processors/filtering/date_filter.py +++ b/processors/filtering/date_filter.py @@ -6,6 +6,7 @@ from datetime import datetime from processors.filtering.base_filter import BaseFilter +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput from common.lib.exceptions import QueryParametersException @@ -23,6 +24,9 @@ class DateFilter(BaseFilter): category = "Filtering" # category title = "Filter by date" # title displayed in UI description = "Retains posts between given dates. This creates a new dataset." 
+ + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -56,16 +60,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on NDJSON and CSV files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def filter_items(self): """ Create a generator to iterate through items that can be passed to create either a csv or ndjson diff --git a/processors/filtering/lexical_filter.py b/processors/filtering/lexical_filter.py index 4fe924220..4c0129f65 100644 --- a/processors/filtering/lexical_filter.py +++ b/processors/filtering/lexical_filter.py @@ -4,6 +4,7 @@ import re from processors.filtering.base_filter import BaseFilter +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Stijn Peeters" @@ -22,6 +23,9 @@ class LexicalFilter(BaseFilter): description = "Retains posts that contain selected words or phrases, including preset word lists. " \ "This creates a new dataset." 
# description displayed in UI + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + references = [ "[Regex101](https://regex101.com/)" ] @@ -62,16 +66,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on NDJSON and CSV files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def filter_items(self): """ Create a generator to iterate through items that can be passed to create either a csv or ndjson. Use diff --git a/processors/filtering/random_filter.py b/processors/filtering/random_filter.py index 48e5e66cc..5b8763fb9 100644 --- a/processors/filtering/random_filter.py +++ b/processors/filtering/random_filter.py @@ -4,6 +4,7 @@ import random from processors.filtering.base_filter import BaseFilter +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput from common.lib.exceptions import QueryParametersException @@ -22,6 +23,9 @@ class RandomFilter(BaseFilter): title = "Random sample" # title displayed in UI description = "Retain a pseudorandom set of posts. This creates a new dataset." 
# description displayed in UI + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -41,16 +45,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on NDJSON and CSV files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def filter_items(self): """ Create a generator to iterate through items that can be passed to create either a csv or ndjson. Use diff --git a/processors/filtering/unique_filter.py b/processors/filtering/unique_filter.py index b87bdce34..8a82dfc47 100644 --- a/processors/filtering/unique_filter.py +++ b/processors/filtering/unique_filter.py @@ -4,6 +4,7 @@ import json from processors.filtering.base_filter import BaseFilter +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Sal Hagen" @@ -21,15 +22,8 @@ class UniqueFilter(BaseFilter): title = "Filter for unique items" # title displayed in UI description = "Only keeps the first encounter of an item. This creates a new dataset." 
# description displayed in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on NDJSON and CSV files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) def filter_items(self): """ diff --git a/processors/metrics/count_posts.py b/processors/metrics/count_posts.py index f95c94a54..e30cabe1c 100644 --- a/processors/metrics/count_posts.py +++ b/processors/metrics/count_posts.py @@ -4,6 +4,7 @@ from common.lib.helpers import UserInput, pad_interval, get_interval_descriptor from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -21,18 +22,8 @@ class CountPosts(BasicProcessor): description = "Counts how many items are in the dataset per date (or overall)." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["histogram"] - - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}, preferred_followups=["histogram"]) @classmethod def get_options(cls, parent_dataset=None, config=None): diff --git a/processors/metrics/thread_metadata.py b/processors/metrics/thread_metadata.py index d2f0f8747..86b487d5a 100644 --- a/processors/metrics/thread_metadata.py +++ b/processors/metrics/thread_metadata.py @@ -5,6 +5,7 @@ import math from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Sal Hagen" __credits__ = ["Sal Hagen"] @@ -25,18 +26,8 @@ class ThreadMetadata(BasicProcessor): ) # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] - - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) def process(self): """ diff --git a/processors/metrics/url_titles.py b/processors/metrics/url_titles.py index 13c9c90a1..a50abec33 100644 --- a/processors/metrics/url_titles.py +++ b/processors/metrics/url_titles.py @@ -4,6 +4,7 @@ import csv 
from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from backend.lib.proxied_requests import FailedProxiedRequest from common.lib.helpers import UserInput from common.lib.exceptions import ProcessorInterruptedException @@ -31,7 +32,8 @@ class URLFetcher(BasicProcessor): "each URL, optionally following HTTP redirects.") # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) config = { "url-metadata.timeout": { @@ -43,17 +45,6 @@ class URLFetcher(BasicProcessor): } } - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - @classmethod def get_options(cls, parent_dataset=None, config=None): """ diff --git a/processors/metrics/vocabulary_overtime.py b/processors/metrics/vocabulary_overtime.py index 8aa5e634a..eae9b2712 100644 --- a/processors/metrics/vocabulary_overtime.py +++ b/processors/metrics/vocabulary_overtime.py @@ -4,6 +4,7 @@ import re from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput, get_interval_descriptor __author__ = "Stijn Peeters" @@ -22,7 +23,8 @@ class OvertimeAnalysis(BasicProcessor): description = "Determines the counts over time of particular set of words or phrases." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) references = [ "[\"Salvaging the Internet Hate Machine: Using the discourse of radical online subcultures to identify emergent extreme speech\" - Unblished paper detailing the OILab extreme speech lexigon](https://oilab.eu/texts/4CAT_Hate_Speech_WebSci_paper.pdf)", @@ -78,17 +80,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def process(self): """ Reads a CSV file, counts occurrences of chosen values over all posts, diff --git a/processors/networks/colink_urls.py b/processors/networks/colink_urls.py index 92335c2d4..ff192d150 100644 --- a/processors/networks/colink_urls.py +++ b/processors/networks/colink_urls.py @@ -8,6 +8,7 @@ import psutil from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import UserInput @@ -33,6 +34,9 @@ class URLCoLinker(BasicProcessor): "Edges are weighted by amount of co-links." 
# description displayed in UI extension = "gexf" # extension of result file, used internally and in UI + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -67,16 +71,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on top datasets. - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def process(self): """ This takes a 4CAT results file as input, and outputs a new CSV file @@ -87,7 +81,7 @@ def process(self): if self.parameters.get("level") == "thread" and "thread_id" not in self.source_dataset.get_columns(): self.dataset.finish_with_error("Thread-level co-linking requires a 'thread_id' column in the dataset.") return - + # we use these to extract URLs and host names if needed link_regex = re.compile(r"https?://[^\s\]()]+") www_regex = re.compile(r"^www\.") @@ -177,7 +171,7 @@ def process(self): self.dataset.update_status(f"Network has {len(network.nodes)} and {len(network.edges)} edges") self.dataset.log(f"time elapsed: {time.time() - start_time:.2f} seconds") self.dataset.update_status("Writing network file") - + writer = multiprocessing.Process(target=_write_gexf, args=(network, self.dataset.get_results_path())) writer.start() while writer.is_alive(): @@ -195,7 +189,7 @@ def process(self): self.dataset.finish_with_error("Network write failed") self.log.warning(f"Network writer exited with code {writer.exitcode} for dataset {self.dataset.key}") return - + self.dataset.log(f"time to complete: {time.time() - start_time:.2f} seconds") self.dataset.finish(len(network.nodes)) diff --git 
a/processors/networks/wikipedia_network.py b/processors/networks/wikipedia_network.py index 334ef304e..e69a00045 100644 --- a/processors/networks/wikipedia_network.py +++ b/processors/networks/wikipedia_network.py @@ -9,6 +9,7 @@ import networkx as nx from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException __author__ = "Stijn Peeters" @@ -27,15 +28,8 @@ class WikiURLCoLinker(BasicProcessor): description = "Create a GEXF network file comprised network comprised of linked-to Wikipedia pages, linked to the categories they are part of. English Wikipedia only. Will only fetch the first 10,000 links." # description displayed in UI extension = "gexf" # extension of result file, used internally and in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on top datasets. - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) def process(self): """ @@ -66,7 +60,7 @@ def process(self): if not post["body"]: continue - + wiki_links = link_regex.findall(post["body"]) # if the result has an explicit url per post, take that into @@ -147,7 +141,7 @@ def stringify_children(node): # Add " (cat)" to the category strings. # This is needed because pages can sometimes have the same name as the category. # This will result in a faulty graph, since there's duplicate nodes. 
- + category += " (cat)" if category not in all_categories: diff --git a/processors/presets/annotate-images.py b/processors/presets/annotate-images.py index b8650e698..829c5d8ac 100644 --- a/processors/presets/annotate-images.py +++ b/processors/presets/annotate-images.py @@ -2,6 +2,7 @@ Annotate top images """ from backend.lib.preset import ProcessorPreset +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput, convert_to_int @@ -17,6 +18,9 @@ class AnnotateImages(ProcessorPreset): "this is a paid service and will count towards your API credit." extension = "csv" + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + references = [ "[Google Vision API Documentation](https://cloud.google.com/vision/docs)", "[Google Vision API Pricing & Free Usage Limits](https://cloud.google.com/vision/pricing)" @@ -67,17 +71,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def get_processor_pipeline(self): """ This queues a series of post-processors to annotate images diff --git a/processors/presets/monthly-histogram.py b/processors/presets/monthly-histogram.py index e982aafc7..a67464b5a 100644 --- a/processors/presets/monthly-histogram.py +++ b/processors/presets/monthly-histogram.py @@ -2,6 +2,7 @@ Extract neologisms """ from backend.lib.preset import ProcessorPreset +from common.lib.compatibility import Compatibility from processors.metrics.count_posts import CountPosts @@ -15,19 +16,9 @@ class MonthlyHistogramCreator(ProcessorPreset): description = "Create a histogram that shows the number of items 
over time." # description displayed in UI extension = "svg" - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - This preset is compatible with any module that has countable items (via count-posts) + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - @classmethod def get_options(cls, parent_dataset=None, config=None): count_options = CountPosts.get_options(parent_dataset=parent_dataset, config=config) diff --git a/processors/presets/similar-words.py b/processors/presets/similar-words.py index 482098ed4..5c172776e 100644 --- a/processors/presets/similar-words.py +++ b/processors/presets/similar-words.py @@ -4,6 +4,7 @@ from nltk.stem.snowball import SnowballStemmer from backend.lib.preset import ProcessorPreset +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput @@ -19,6 +20,9 @@ class SimilarWords(ProcessorPreset): "with large datasets (e.g. 
100,000+ items).") extension = "csv" + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -50,19 +54,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - This preset is compatible with any module that has a "body" column - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def get_processor_pipeline(self): """ This queues a series of post-processors to calculate word similarities From 309e132bed922b8c465bf44229479431e9fe3574 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 16 Jun 2026 15:18:46 +0200 Subject: [PATCH 13/30] compatibility: multi type checks --- processors/conversion/view_metadata.py | 14 ++++---------- processors/machine_learning/audio_to_text.py | 11 ++++------- processors/machine_learning/clarifai_api.py | 14 +++----------- processors/machine_learning/google_vision_api.py | 14 +++----------- processors/visualisation/video_scene_identifier.py | 11 +++-------- 5 files changed, 17 insertions(+), 47 deletions(-) diff --git a/processors/conversion/view_metadata.py b/processors/conversion/view_metadata.py index 3a2436f3b..8d09afb44 100644 --- a/processors/conversion/view_metadata.py +++ b/processors/conversion/view_metadata.py @@ -7,6 +7,7 @@ import zipfile from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.user_input import UserInput __author__ = "Dale Wahl" @@ -27,6 +28,9 @@ class ViewMetadata(BasicProcessor): description = "Reformats the .metadata.json file and calculates analytics" # description displayed in UI 
extension = "csv" # extension of result file, used internally and in UI + # Allow on downloaded media datasets + compatibility = Compatibility(type_prefixes={"video-downloader", "image-downloader"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -47,16 +51,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type.startswith("video-downloader") or module.type.startswith("image-downloader") - def process(self): """ Grabs .metadata.json and reformats diff --git a/processors/machine_learning/audio_to_text.py b/processors/machine_learning/audio_to_text.py index 62a5e9458..6eb6b1a3a 100644 --- a/processors/machine_learning/audio_to_text.py +++ b/processors/machine_learning/audio_to_text.py @@ -7,6 +7,7 @@ from requests.exceptions import ConnectionError from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.dmi_service_manager import DmiServiceManager, DmiServiceManagerException, DsmOutOfMemory from common.lib.exceptions import ProcessorInterruptedException from common.lib.user_input import UserInput @@ -29,6 +30,9 @@ class AudioToText(BasicProcessor): " GPT models (GPT only via API).") # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI + # Allow on audio datasets + compatibility = Compatibility(media_types={"audio"}, type_prefixes={"audio-extractor"}) + followups = [] references = [ @@ -90,13 +94,6 @@ def get_queue_id(cls, remote_id, details, dataset) -> str: # Queue per model/API type return f"{cls.type}-{dataset.parameters.get('model_host', 'local')}" - @classmethod - def is_compatible_with(cls, module=None, 
config=None): - """ - Allow on audio archives - """ - return module.get_media_type() == 'audio' or module.type.startswith("audio-extractor") - @classmethod def get_options(cls, parent_dataset=None, config=None): """ diff --git a/processors/machine_learning/clarifai_api.py b/processors/machine_learning/clarifai_api.py index aad832e91..1b6dfaab1 100644 --- a/processors/machine_learning/clarifai_api.py +++ b/processors/machine_learning/clarifai_api.py @@ -10,6 +10,7 @@ from common.lib.helpers import UserInput, convert_to_int from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -33,7 +34,8 @@ class ClarifaiAPIFetcher(BasicProcessor): "requests will be credited by Clarifai to the owner of the API token you provide." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - followups = ["convert-clarifai-vision-to-csv", "clarifai-bipartite-network"] + # Allow on image sets + compatibility = Compatibility(media_types={"image"}, type_prefixes={"image-downloader"}, types={"video-frames"}, preferred_followups=["convert-clarifai-vision-to-csv", "clarifai-bipartite-network"]) references = [ "[Clarifai](https://www.clarifai.com/)", @@ -41,16 +43,6 @@ class ClarifaiAPIFetcher(BasicProcessor): "[Clarifai model browser](https://clarifai.com/clarifai/main/models)" ] - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on image sets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_media_type() == "image" or module.type.startswith("image-downloader") or module.type == "video-frames" - @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ diff --git a/processors/machine_learning/google_vision_api.py b/processors/machine_learning/google_vision_api.py 
index 2491967e1..73cae0d2a 100644 --- a/processors/machine_learning/google_vision_api.py +++ b/processors/machine_learning/google_vision_api.py @@ -10,6 +10,7 @@ from common.lib.helpers import UserInput, convert_to_int from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException __author__ = "Stijn Peeters" @@ -35,23 +36,14 @@ class GoogleVisionAPIFetcher(BasicProcessor): "and Google Vision API enabled (this may take a few minutes)." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - followups = ["convert-google-vision-to-csv", "vision-bipartite-network", "vision-label-network"] + # Allow on image sets + compatibility = Compatibility(media_types={"image"}, type_prefixes={"image-downloader"}, types={"video-frames"}, preferred_followups=["convert-google-vision-to-csv", "vision-bipartite-network", "vision-label-network"]) references = [ "[Google Vision API Documentation](https://cloud.google.com/vision/docs)", "[Google Vision API Pricing & Free Usage Limits](https://cloud.google.com/vision/pricing)" ] - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on image sets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_media_type() == "image" or module.type.startswith("image-downloader") or module.type == "video-frames" - @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ diff --git a/processors/visualisation/video_scene_identifier.py b/processors/visualisation/video_scene_identifier.py index c7731aca4..7fa6c03b8 100644 --- a/processors/visualisation/video_scene_identifier.py +++ b/processors/visualisation/video_scene_identifier.py @@ -10,6 +10,7 @@ from scenedetect import open_video, SceneManager, VideoOpenFailure, FrameTimecode from 
backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException, ProcessorException from common.lib.user_input import UserInput @@ -39,7 +40,8 @@ class VideoSceneDetector(BasicProcessor): "intensity or cuts and fades to black) and extract the scene metadata." # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["video-scene-frames", "video-timelines"] + # Allow on video datasets + compatibility = Compatibility(media_types={"video"}, type_prefixes={"video-downloader"}, preferred_followups=["video-scene-frames", "video-timelines"]) references = [ "[PySceneDetect](https://github.com/Breakthrough/PySceneDetect)", @@ -205,13 +207,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on videos - """ - return module.get_media_type() == "video" or module.type.startswith("video-downloader") - def process(self): """ This takes a zipped set of videos, uses https://github.com/Breakthrough/PySceneDetect to detect scene breaks in From 863f0858574b19bf03ce1ca25a0b83d93d9980ed Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 16 Jun 2026 15:44:11 +0200 Subject: [PATCH 14/30] compatibility: fix executable check (pass function) to settings check --- common/lib/compatibility.py | 13 ++++++++++++- processors/audio/audio_extractor.py | 15 +++------------ processors/presets/video-scene-timelines.py | 18 +++--------------- processors/visualisation/video_frames.py | 15 +++------------ processors/visualisation/video_scene_frames.py | 18 +++--------------- 5 files changed, 24 insertions(+), 55 deletions(-) diff --git a/common/lib/compatibility.py b/common/lib/compatibility.py index c43660af8..1ccbd1f57 100644 --- a/common/lib/compatibility.py +++ b/common/lib/compatibility.py @@ -16,7 +16,7 @@ compatibility = 
Compatibility( media_types={"video"}, type_prefixes={"video-downloader"}, - required_settings={("video-downloader.ffmpeg_path", shutil.which)}, + required_settings={("video-downloader.ffmpeg_path", is_executable)}, ) BasicProcessor.is_compatible_with() evaluates it. A processor whose @@ -57,6 +57,17 @@ def _maybe_call(module, method): return attr +def is_executable(path): + """ + Matcher for `required_settings`: the setting's value must point to an + executable found on the system (resolved with `shutil.which`). An unset or + empty value fails safely, e.g.:: + + required_settings={("video-downloader.ffmpeg_path", is_executable)} + """ + return bool(path) and shutil.which(path) is not None + + @dataclass class Compatibility: """ diff --git a/processors/audio/audio_extractor.py b/processors/audio/audio_extractor.py index 91c68fc7b..637798836 100644 --- a/processors/audio/audio_extractor.py +++ b/processors/audio/audio_extractor.py @@ -10,6 +10,7 @@ import oslex from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility, is_executable from common.lib.exceptions import ProcessorInterruptedException __author__ = "Dale Wahl" @@ -33,18 +34,8 @@ class AudioExtractor(BasicProcessor): extension = "zip" # extension of result file, used internally and in UI media_type = "audio" - followups = ["audio-to-text"] - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on videos only - - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return (module.get_media_type() == "video" or module.type.startswith("video-downloader")) and \ - config.get("video-downloader.ffmpeg_path") and \ - shutil.which(config.get("video-downloader.ffmpeg_path")) + # Allow on video datasets when ffmpeg is available + compatibility = Compatibility(media_types={"video"}, type_prefixes={"video-downloader"}, required_settings={("video-downloader.ffmpeg_path", is_executable)}, 
preferred_followups=["audio-to-text"]) @classmethod def get_options(cls, parent_dataset=None, config=None): diff --git a/processors/presets/video-scene-timelines.py b/processors/presets/video-scene-timelines.py index 579288583..406c2a07e 100644 --- a/processors/presets/video-scene-timelines.py +++ b/processors/presets/video-scene-timelines.py @@ -1,9 +1,9 @@ """ Create scene-by-scene timelines """ -import shutil from backend.lib.preset import ProcessorPreset +from common.lib.compatibility import Compatibility, is_executable class VideoSceneTimelineCreator(ProcessorPreset): @@ -18,20 +18,8 @@ class VideoSceneTimelineCreator(ProcessorPreset): "for all videos are then stacked vertically and rendered as a single SVG file." extension = "svg" - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine compatibility - - Compatible with downloaded videos, and not really anything else! - Additionally ffmpeg needs to be available. - - :param DataSet module: Module ID to determine compatibility with - :return bool: - """ - return (module.get_media_type() == "video" or module.type.startswith("video-downloader")) and \ - config.get("video-downloader.ffmpeg_path") and \ - shutil.which(config.get("video-downloader.ffmpeg_path")) + # Allow on video datasets when ffmpeg is available + compatibility = Compatibility(media_types={"video"}, type_prefixes={"video-downloader"}, required_settings={("video-downloader.ffmpeg_path", is_executable)}) def get_processor_pipeline(self): """ diff --git a/processors/visualisation/video_frames.py b/processors/visualisation/video_frames.py index 3ce17dcca..9a24621fa 100644 --- a/processors/visualisation/video_frames.py +++ b/processors/visualisation/video_frames.py @@ -8,6 +8,7 @@ import oslex from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility, is_executable from common.lib.exceptions import ProcessorInterruptedException from common.lib.user_input import UserInput from 
processors.visualisation.download_videos import VideoDownloaderPlus @@ -30,7 +31,8 @@ class VideoFrames(BasicProcessor): description = "Extract frames from videos" # description displayed in UI extension = "zip" # extension of result file, used internally and in UI - followups = ["video-timelines"] + VideoDownloaderPlus.followups + # Allow on video datasets when ffmpeg is available + compatibility = Compatibility(media_types={"video"}, type_prefixes={"video-downloader"}, required_settings={("video-downloader.ffmpeg_path", is_executable)}, preferred_followups=["video-timelines"] + VideoDownloaderPlus.followups) @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -67,17 +69,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on videos - - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return (module.get_media_type() == "video" or module.type.startswith("video-downloader")) and \ - config.get("video-downloader.ffmpeg_path") and \ - shutil.which(config.get("video-downloader.ffmpeg_path")) - def process(self): """ This takes a zipped set of videos, uses https://pypi.org/project/videohash/ and https://ffmpeg.org/ to collect diff --git a/processors/visualisation/video_scene_frames.py b/processors/visualisation/video_scene_frames.py index 7439e4da6..65e136fee 100644 --- a/processors/visualisation/video_scene_frames.py +++ b/processors/visualisation/video_scene_frames.py @@ -12,6 +12,7 @@ from packaging import version from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility, is_executable from common.lib.user_input import UserInput from common.lib.helpers import get_ffmpeg_version @@ -33,7 +34,8 @@ class VideoSceneFrames(BasicProcessor): description = "For each scene identified, extracts a key frame (e.g. the first frame)." 
# description displayed in UI extension = "zip" # extension of result file, used internally and in UI - followups = ["video-timelines"] + # Allow on detected video scenes when ffmpeg is available + compatibility = Compatibility(types={"video-scene-detector"}, required_settings={("video-downloader.ffmpeg_path", is_executable)}, preferred_followups=["video-timelines"]) @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -72,20 +74,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine compatibility - - Compatible with scene data only. - - :param str module: Module ID to determine compatibility with - :return bool: - """ - return module.type in ["video-scene-detector"] and \ - config.get("video-downloader.ffmpeg_path") and \ - shutil.which(config.get("video-downloader.ffmpeg_path")) - def process(self): """ This takes a zipped set of videos, uses https://pypi.org/project/videohash/ and https://ffmpeg.org/ to collect From d71f6ce95e89c1bb324662dc574744d065a3c858 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 16 Jun 2026 16:23:23 +0200 Subject: [PATCH 15/30] compatibility: figure out ffmpeg -> ffprobe connection and generalize. 
--- common/lib/compatibility.py | 25 +++++++++++++++++++++++++ processors/visualisation/image_wall.py | 25 +++---------------------- processors/visualisation/video_stack.py | 23 ++++------------------- processors/visualisation/video_wall.py | 23 ++++------------------- 4 files changed, 36 insertions(+), 60 deletions(-) diff --git a/common/lib/compatibility.py b/common/lib/compatibility.py index 1ccbd1f57..3311a62e3 100644 --- a/common/lib/compatibility.py +++ b/common/lib/compatibility.py @@ -68,6 +68,31 @@ def is_executable(path): return bool(path) and shutil.which(path) is not None +class ExecutableSibling: + """ + Matcher for `required_settings`: the configured executable must resolve (via + `shutil.which`) AND a sibling executable must exist next to it, found by + swapping the name in the resolved path. For tools that ship together, e.g. + ffprobe alongside ffmpeg:: + + required_settings={("video-downloader.ffmpeg_path", + ExecutableSibling("ffmpeg", "ffprobe"))} + + The matcher protocol is a one-argument callable, so arguments are passed via + the constructor; `name`/`sibling` stay readable for a future UI. None-safe. 
+ """ + + def __init__(self, name, sibling): + self.name = name + self.sibling = sibling + + def __call__(self, path): + resolved = shutil.which(path) if path else None + if not resolved: + return False + return shutil.which(self.sibling.join(resolved.rsplit(self.name, 1))) is not None + + @dataclass class Compatibility: """ diff --git a/processors/visualisation/image_wall.py b/processors/visualisation/image_wall.py index 2438dac61..ebbb25f2b 100644 --- a/processors/visualisation/image_wall.py +++ b/processors/visualisation/image_wall.py @@ -4,8 +4,8 @@ from PIL import Image, ImageOps, UnidentifiedImageError from sklearn.cluster import KMeans from common.lib.helpers import UserInput +from common.lib.compatibility import Compatibility, ExecutableSibling import colorsys -import shutil import copy from processors.visualisation.video_wall import VideoWallGenerator @@ -30,27 +30,8 @@ class ImageWallGenerator(VideoWallGenerator): description = "Put all images in a single combined image, side by side. Images can be sorted and resized." 
extension = "png" - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Check compatibility - - This processor can run 1) if ffmpeg is available and 2) if the source - is an image or video dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - ffmpeg_path = shutil.which(config.get("video-downloader.ffmpeg_path")) - ffprobe_path = ( - shutil.which("ffprobe".join(ffmpeg_path.rsplit("ffmpeg", 1))) - if ffmpeg_path - else None - ) - have_ffmpeg = (ffmpeg_path and ffprobe_path) - return have_ffmpeg and (module.get_media_type() in ("video", "image") - or module.type.startswith("image-downloader") - or module.type == "video-frames") + # Allow on image/video datasets when ffmpeg and ffprobe are available + compatibility = Compatibility(media_types={"video", "image"}, type_prefixes={"image-downloader"}, types={"video-frames"}, required_settings={("video-downloader.ffmpeg_path", ExecutableSibling("ffmpeg", "ffprobe"))}) @classmethod def get_options(cls, parent_dataset=None, config=None): diff --git a/processors/visualisation/video_stack.py b/processors/visualisation/video_stack.py index 390d9f7f3..a2f2fcac5 100644 --- a/processors/visualisation/video_stack.py +++ b/processors/visualisation/video_stack.py @@ -12,6 +12,7 @@ from packaging import version from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility, ExecutableSibling from common.lib.exceptions import ProcessorInterruptedException from common.lib.user_input import UserInput from common.lib.helpers import get_ffmpeg_version @@ -36,6 +37,9 @@ class VideoStack(BasicProcessor): "videos. Videos are stacked by length, i.e. the longest video is at the 'bottom' of the stack." 
# description displayed in UI extension = "mp4" # extension of result file, used internally and in UI + # Allow on video datasets when ffmpeg and ffprobe are available + compatibility = Compatibility(media_types={"video"}, type_prefixes={"video-downloader"}, required_settings={("video-downloader.ffmpeg_path", ExecutableSibling("ffmpeg", "ffprobe"))}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -98,25 +102,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine compatibility - - :param DataSet module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - if not (module.get_media_type() == "video" or module.type.startswith("video-downloader")): - return False - else: - # Only check these if we have a video dataset - # also need ffprobe to determine video lengths - # is usually installed in same place as ffmpeg - ffmpeg_path = shutil.which(config.get("video-downloader.ffmpeg_path")) - ffprobe_path = shutil.which("ffprobe".join(ffmpeg_path.rsplit("ffmpeg", 1))) if ffmpeg_path else None - return ffmpeg_path and ffprobe_path - def process(self): """ This takes a zipped set of videos, uses https://pypi.org/project/videohash/ and https://ffmpeg.org/ to collect diff --git a/processors/visualisation/video_wall.py b/processors/visualisation/video_wall.py index fb4f54946..74800834c 100644 --- a/processors/visualisation/video_wall.py +++ b/processors/visualisation/video_wall.py @@ -12,6 +12,7 @@ from common.lib.helpers import UserInput, get_ffmpeg_version, convert_to_int from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility, ExecutableSibling from common.lib.exceptions import ProcessorInterruptedException, MediaSignatureException __author__ = "Stijn Peeters" @@ -34,6 +35,9 @@ class 
VideoWallGenerator(BasicProcessor): description = "Put all videos in a single combined video, side by side. Videos can be sorted and resized." extension = "mp4" # extension of result file, used internally and in UI + # Allow on video datasets when ffmpeg and ffprobe are available + compatibility = Compatibility(media_types={"video"}, type_prefixes={"video-downloader"}, required_settings={("video-downloader.ffmpeg_path", ExecutableSibling("ffmpeg", "ffprobe"))}) + # videos will be arranged and resized to fit these image wall dimensions # note that video aspect ratio may not allow for a precise fit TARGET_DIMENSIONS = { @@ -128,25 +132,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine compatibility - - :param DataSet module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - if not (module.get_media_type() == "video" or module.type.startswith("video-downloader")): - return False - else: - # Only check these if we have a video dataset - # also need ffprobe to determine video lengths - # is usually installed in same place as ffmpeg - ffmpeg_path = shutil.which(config.get("video-downloader.ffmpeg_path")) - ffprobe_path = shutil.which("ffprobe".join(ffmpeg_path.rsplit("ffmpeg", 1))) if ffmpeg_path else None - return ffmpeg_path and ffprobe_path - def process(self): """ Go through media files, determine dimensions, sort according to the From f8d64086cf100b4427aa55f1799e1869311cc0de Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 16 Jun 2026 16:39:18 +0200 Subject: [PATCH 16/30] compatibility: add a short circuit! do not check every requirement. 
and leave expensive last --- common/lib/compatibility.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/common/lib/compatibility.py b/common/lib/compatibility.py index 3311a62e3..83a7379f4 100644 --- a/common/lib/compatibility.py +++ b/common/lib/compatibility.py @@ -57,6 +57,10 @@ def _maybe_call(module, method): return attr +# TODO: memoize shutil.which() (used by is_executable / ExecutableSibling) -- its +# result is effectively constant per process, so a cached wrapper (positive results only, to +# avoid stale negatives if an executable is installed without a restart) would +# avoid repeated $PATH scans on video-heavy pages. def is_executable(path): """ Matcher for `required_settings`: the setting's value must point to an @@ -154,13 +158,19 @@ def is_compatible_with(self, module, config=None) -> bool: """ return not self.unmet_requirements(module, config=config) - def unmet_requirements(self, module, config=None) -> List[str]: + def unmet_requirements(self, module, config=None, first_only=True) -> List[str]: """ Return the requirements `module` does not meet, as readable strings. An empty list means `module` is compatible. Each string names one thing that is missing -- a wrong dataset type, an absent column, a setting that is not configured, and so on. + + By default the method returns as soon as one requirement is unmet -- + enough for the yes/no `is_compatible_with`, and it skips the expensive + environment checks (config reads, shutil.which) whenever a cheap shape + check already fails. Pass `first_only=False` to collect every unmet + requirement -- used to explain why a module is not compatible. 
""" reasons: List[str] = [] if module is None: @@ -170,14 +180,20 @@ def unmet_requirements(self, module, config=None) -> List[str]: # must be one of them if self._identity_declared() and not self._identity_matches(module): reasons.append("dataset type/media is not accepted") + if first_only: + return reasons if self.top_dataset_only and not _maybe_call(module, "is_top_dataset"): reasons.append("requires a top-level dataset") + if first_only: + return reasons if self.extensions is not None: extension = _maybe_call(module, "get_extension") if extension not in set(self.extensions): reasons.append("requires extension: %s" % ", ".join(self.extensions)) + if first_only: + return reasons if self.rankable is not None: if bool(_maybe_call(module, "is_rankable")) != self.rankable: @@ -185,6 +201,8 @@ def unmet_requirements(self, module, config=None) -> List[str]: "requires a rankable dataset" if self.rankable else "requires a non-rankable dataset" ) + if first_only: + return reasons # the only check that really needs a DataSet object if self.requires_columns: @@ -192,7 +210,11 @@ def unmet_requirements(self, module, config=None) -> List[str]: missing = [column for column in self.requires_columns if column not in columns] if missing: reasons.append("requires column(s): %s" % ", ".join(missing)) + if first_only: + return reasons + # Note: this may have executable checks that are expensive so we do it after cheaper checks + # TODO: could separate quicker setting checks from slower ones for requirement in self.required_settings: key, expected = (requirement, None) if isinstance(requirement, str) else requirement value = config.get(key) if config is not None else None @@ -210,10 +232,14 @@ def unmet_requirements(self, module, config=None) -> List[str]: met = value == expected if not met: reasons.append("requires setting: %s" % key) + if first_only: + return reasons for package in self.required_packages: if not shutil.which(package): reasons.append("requires package: %s" % package) 
+ if first_only: + return reasons return reasons From 6f8e60582addce5c9d148807aef25b2c47916d73 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 17 Jun 2026 09:05:40 +0200 Subject: [PATCH 17/30] compatibility datasources --- processors/conversion/split_by_thread.py | 12 +++--------- processors/metrics/debate_metrics.py | 14 +++----------- processors/metrics/most_quoted.py | 15 +++------------ processors/networks/quote_network.py | 12 +++--------- 4 files changed, 12 insertions(+), 41 deletions(-) diff --git a/processors/conversion/split_by_thread.py b/processors/conversion/split_by_thread.py index 87027ff64..a53f8e55e 100644 --- a/processors/conversion/split_by_thread.py +++ b/processors/conversion/split_by_thread.py @@ -4,6 +4,7 @@ import csv from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -25,16 +26,9 @@ class ThreadSplitter(BasicProcessor): description = "Split the dataset per thread. The result is a ZIP archive containing separate CSV files." 
# description displayed in UI extension = "zip" # extension of result file, used internally and in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset + # datasets with a thread structure (4chan/8chan, reddit, breitbart) + compatibility = Compatibility(datasources={"fourchan", "eightchan", "reddit", "breitbart"}) - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.parameters.get("datasource") in ("4chan", "8chan", "reddit", "breitbart") - def process(self): """ This takes a 4CAT results file as input, and outputs a new CSV file diff --git a/processors/metrics/debate_metrics.py b/processors/metrics/debate_metrics.py index 4e977fdd9..d784fb848 100644 --- a/processors/metrics/debate_metrics.py +++ b/processors/metrics/debate_metrics.py @@ -5,6 +5,7 @@ import time from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Sal Hagen" __credits__ = ["Sal Hagen"] @@ -30,17 +31,8 @@ class DebateMetrics(BasicProcessor): description = "Returns a csv with meta-metrics per thread." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor if dataset is a 'top level' dataset - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.parameters.get("datasource") in ("fourchan", "eightchan", "eightkun") + # chan datasets (thread-level debate metrics) + compatibility = Compatibility(datasources={"fourchan", "eightchan", "eightkun"}) def process(self): """ diff --git a/processors/metrics/most_quoted.py b/processors/metrics/most_quoted.py index c8a83982b..6bb49fe4b 100644 --- a/processors/metrics/most_quoted.py +++ b/processors/metrics/most_quoted.py @@ -5,6 +5,7 @@ import re from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -23,18 +24,8 @@ class QuoteRanker(BasicProcessor): description = "Sort posts by how often they were replied to by other posts in the dataset." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on chan datasets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.parameters.get("datasource") in ("fourchan", "eightchan", "eightkun") - + # chan datasets (posts reply to / quote each other) + compatibility = Compatibility(datasources={"fourchan", "eightchan", "eightkun"}) def process(self): """ diff --git a/processors/networks/quote_network.py b/processors/networks/quote_network.py index 0ad478fa7..98ebf2979 100644 --- a/processors/networks/quote_network.py +++ b/processors/networks/quote_network.py @@ -4,6 +4,7 @@ import re from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility import networkx as nx @@ -25,16 +26,9 @@ class QuoteNetworkGrapher(BasicProcessor): "Each reference to another post creates an edge between posts. 
" # description displayed in UI extension = "gexf" # extension of result file, used internally and in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on chan datasets + # chan datasets (posts reply to / quote each other) + compatibility = Compatibility(datasources={"fourchan", "eightchan", "eightkun"}) - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.parameters.get("datasource") in ("fourchan", "eightchan", "eightkun") - def process(self): """ This takes a 4CAT results file as input, and outputs a new CSV file From 6ee0600f131a5361cbd956dd31521bd33ed1aba7 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 17 Jun 2026 09:22:38 +0200 Subject: [PATCH 18/30] compatibility: add is_rankable and handle ranking multiple items True/False --- common/lib/compatibility.py | 12 ++++++++---- processors/visualisation/isoviz.py | 16 ++++------------ processors/visualisation/rankflow.py | 14 ++++---------- processors/visualisation/vector_histogram.py | 14 ++++---------- 4 files changed, 20 insertions(+), 36 deletions(-) diff --git a/common/lib/compatibility.py b/common/lib/compatibility.py index 83a7379f4..7a858b06f 100644 --- a/common/lib/compatibility.py +++ b/common/lib/compatibility.py @@ -37,21 +37,22 @@ from typing import Iterable, List, Optional -def _maybe_call(module, method): +def _maybe_call(module, method, **kwargs): """ Read `module.method` without assuming it exists. Calls it and returns the result when it is a method, returns the value when it is a plain attribute, and returns None when it is missing or raises. A DataSet exposes these as methods; a processor class exposes some of them as - well, and this keeps the same check working for both. + well, and this keeps the same check working for both. Any keyword arguments + are forwarded to the call (e.g. is_rankable(multiple_items=False)). 
""" attr = getattr(module, method, None) if attr is None: return None if callable(attr): try: - return attr() + return attr(**kwargs) except Exception: return None return attr @@ -128,6 +129,9 @@ class Compatibility: top_dataset_only: bool = False # When set, the dataset's is_rankable() must equal this. None means it does not matter. rankable: Optional[bool] = None + # Forwarded to is_rankable(multiple_items=...) when `rankable` is set. False + # restricts to single-value rankings (rejecting multi-column word_1/word_2/... rankings). + rankable_multiple_items: bool = True # Columns that must all be present in the dataset. This can only be checked # against a real dataset, as it reads the dataset's columns. requires_columns: Iterable[str] = () @@ -196,7 +200,7 @@ def unmet_requirements(self, module, config=None, first_only=True) -> List[str]: return reasons if self.rankable is not None: - if bool(_maybe_call(module, "is_rankable")) != self.rankable: + if bool(_maybe_call(module, "is_rankable", multiple_items=self.rankable_multiple_items)) != self.rankable: reasons.append( "requires a rankable dataset" if self.rankable else "requires a non-rankable dataset" diff --git a/processors/visualisation/isoviz.py b/processors/visualisation/isoviz.py index 697e21bfc..a1199abda 100644 --- a/processors/visualisation/isoviz.py +++ b/processors/visualisation/isoviz.py @@ -6,6 +6,7 @@ from backend.lib.processor import BasicProcessor from common.lib.helpers import UserInput, convert_to_int, pad_interval, get_4cat_canvas +from common.lib.compatibility import Compatibility from calendar import month_abbr from math import sin, cos, tan, degrees, radians, copysign @@ -35,6 +36,9 @@ class IsometricMultigraphRenderer(BasicProcessor): description = "Generate area graphs showing prevalence per item over time. These are visualised side-by-side on an isometric plane for easy comparison." 
# description displayed in UI extension = "svg" # extension of result file, used internally and in UI + # rankable datasets with a single value per item (multiple_items=False) + compatibility = Compatibility(rankable=True, rankable_multiple_items=False) + # a palette generated with https://medialab.github.io/iwanthue/ colours = ["#eb010a", "#495dff", "#f35f00", "#5137e0", "#ffeb45", "#d05edf", "#00cb3a", "#b200c7", "#d8fd5d", "#a058ff", "#b90fd4", "#6fb300", @@ -85,18 +89,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on rankable items - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - if module.is_dataset(): - return module.is_rankable(multiple_items=False) - return False - def process(self): graphs = {} intervals = [] diff --git a/processors/visualisation/rankflow.py b/processors/visualisation/rankflow.py index a8ad588ab..99397321d 100644 --- a/processors/visualisation/rankflow.py +++ b/processors/visualisation/rankflow.py @@ -7,6 +7,7 @@ from backend.lib.processor import BasicProcessor from common.lib.helpers import UserInput, get_4cat_canvas from common.lib.exceptions import ProcessorInterruptedException +from common.lib.compatibility import Compatibility from svgwrite.shapes import Rect from svgwrite.path import Path @@ -41,6 +42,9 @@ class RankFlowRenderer(BasicProcessor): ) # description displayed in UI extension = "svg" # extension of result file, used internally and in UI + # rankable datasets, including multi-column rankings (e.g. top vectors per interval) + compatibility = Compatibility(rankable=True) + references = [ "[Rieder, B. RankFlow. 
*The Politics of Systems*](https://labs.polsys.net/tools/rankflow/)" ] @@ -133,16 +137,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on rankable items - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_rankable() - def process(self): """ Render RankFlow diagram diff --git a/processors/visualisation/vector_histogram.py b/processors/visualisation/vector_histogram.py index 725960e08..e01f2ba22 100644 --- a/processors/visualisation/vector_histogram.py +++ b/processors/visualisation/vector_histogram.py @@ -12,6 +12,7 @@ from backend.lib.processor import BasicProcessor from common.lib.helpers import UserInput, pad_interval, get_4cat_canvas +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -28,6 +29,9 @@ class SVGHistogramRenderer(BasicProcessor): description = "Generates a histogram from time frequencies." 
# description displayed in UI extension = "svg" + # rankable datasets with a single value per item (multiple_items=False) + compatibility = Compatibility(rankable=True, rankable_multiple_items=False) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -48,16 +52,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on rankable items - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_rankable(multiple_items=False) - def process(self): """ Render an SVG histogram/bar chart using a previous frequency analysis From 5384e92b1af31cfcc664b505879d4776d5c3ca94 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 17 Jun 2026 09:27:01 +0200 Subject: [PATCH 19/30] compatibility media_types --- processors/conversion/hash_images.py | 14 ++++---------- processors/filtering/unique_images.py | 14 ++++---------- processors/visualisation/word-trees.py | 14 ++++---------- 3 files changed, 12 insertions(+), 30 deletions(-) diff --git a/processors/conversion/hash_images.py b/processors/conversion/hash_images.py index 25a6eb4de..13b0414ca 100644 --- a/processors/conversion/hash_images.py +++ b/processors/conversion/hash_images.py @@ -9,6 +9,7 @@ from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import UserInput, hash_image, stringify_hash +from common.lib.compatibility import Compatibility from processors.metrics.group_hashes import HashGrouper @@ -28,6 +29,9 @@ class ImageHasher(BasicProcessor): description = "Convert images to text hashes for comparison and similarity detection." 
# description displayed in UI extension = "csv" + # image datasets: image archives, image-downloader output, or extracted video frames + compatibility = Compatibility(media_types={"image"}, type_prefixes={"image-downloader"}, types={"video-frames"}) + references = [ "[Imagehash library](https://github.com/JohannesBuchner/imagehash?tab=readme-ov-file)", "Explainer: [Perceptual hashing](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)", @@ -94,16 +98,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on image archives - - :param module: Module to determine compatibility with - """ - return module.get_media_type() == "image" or module.type.startswith( - "image-downloader") or module.type == "video-frames" - def process(self): """ Loop through images and hashing them diff --git a/processors/filtering/unique_images.py b/processors/filtering/unique_images.py index 482ad3ceb..8f5b448eb 100644 --- a/processors/filtering/unique_images.py +++ b/processors/filtering/unique_images.py @@ -7,6 +7,7 @@ from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import UserInput, hash_file +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -24,6 +25,9 @@ class UniqueImageFilter(BasicProcessor): description = "Only keeps one instance per image using various detection methods." 
# description displayed in UI extension = "zip" + # image datasets: image archives, image-downloader output, or extracted video frames + compatibility = Compatibility(media_types={"image"}, type_prefixes={"image-downloader"}, types={"video-frames"}) + references = [ "[Imagehash library](https://github.com/JohannesBuchner/imagehash?tab=readme-ov-file)", "Explainer: [Average hash](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)", @@ -58,16 +62,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on image archives - - :param module: Module to determine compatibility with - """ - return module.get_media_type() == "image" or module.type.startswith( - "image-downloader") or module.type == "video-frames" - def process(self): """ Loop through images and only retain ones that have not been seen yet diff --git a/processors/visualisation/word-trees.py b/processors/visualisation/word-trees.py index e64e96410..0bbaf602a 100644 --- a/processors/visualisation/word-trees.py +++ b/processors/visualisation/word-trees.py @@ -8,6 +8,7 @@ from backend.lib.processor import BasicProcessor from common.lib.helpers import UserInput, convert_to_int, get_4cat_canvas from common.lib.exceptions import QueryParametersException +from common.lib.compatibility import Compatibility from nltk.tokenize import word_tokenize @@ -168,6 +169,9 @@ class MakeWordtree(BasicProcessor): description = "Generates a word tree for a given query, a \"graphical version of the traditional 'keyword-in-context' method\" (Wattenberg & Viégas, 2008)." # description displayed in UI extension = "svg" # extension of result file, used internally and in UI + # any csv or ndjson dataset + compatibility = Compatibility(extensions={"csv", "ndjson"}) + references = [ "Wattenberg, M., & Viégas, F. B. (2008). 
[The Word Tree, an Interactive Visual Concordance](https://doi.org/10.1109/TVCG.2008.172). IEEE Transactions on Visualization and Computer Graphics, 14(6), 1221–1228." ] @@ -307,16 +311,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on all csv and NDJSON datasets - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_extension() in ("csv", "ndjson") - def process(self): """ This takes a 4CAT results file as input, and outputs a plain text file From 27c8e2fd5b39c7910ada041131aba0b1fca3b646 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 17 Jun 2026 09:27:25 +0200 Subject: [PATCH 20/30] compatibility: required_settings --- processors/conversion/tcat_auto_upload.py | 21 ++++--------------- .../machine_learning/blip2_image_caption.py | 14 +++---------- .../clip_categorize_images.py | 13 +++--------- processors/machine_learning/pix-plot.py | 16 +++----------- .../machine_learning/text_from_image.py | 16 +++----------- processors/presets/upload-to-dmi-tcat.py | 18 ++++------------ 6 files changed, 20 insertions(+), 78 deletions(-) diff --git a/processors/conversion/tcat_auto_upload.py b/processors/conversion/tcat_auto_upload.py index 2a72d610e..afdc2d521 100644 --- a/processors/conversion/tcat_auto_upload.py +++ b/processors/conversion/tcat_auto_upload.py @@ -9,6 +9,7 @@ from backend.lib.processor import BasicProcessor from common.lib.user_input import UserInput from common.lib.helpers import get_last_line +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl", "Stijn Peeters"] @@ -27,6 +28,9 @@ class FourcatToDmiTcatUploader(BasicProcessor): description = "Send a TCAT-ready JSON file to a particular DMI-TCAT server." 
# description displayed in UI extension = "html" # extension of result file, used internally and in UI + # the TCAT converter's output, when a TCAT server is configured + compatibility = Compatibility(types={"convert-ndjson-for-tcat"}, required_settings={"tcat-auto-upload.server_url", "tcat-auto-upload.token", "tcat-auto-upload.username", "tcat-auto-upload.password"}) + config = { # TCAT Server Connection Info 'tcat-auto-upload.server_url': { @@ -59,23 +63,6 @@ class FourcatToDmiTcatUploader(BasicProcessor): }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - It is if TCAT credentials have been configured and the input is a - TCAT-compatible file. - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "convert-ndjson-for-tcat" and \ - config.get('tcat-auto-upload.server_url') and \ - config.get('tcat-auto-upload.token') and \ - config.get('tcat-auto-upload.username') and \ - config.get('tcat-auto-upload.password') - @classmethod def get_options(cls, parent_dataset=None, config=None): """ diff --git a/processors/machine_learning/blip2_image_caption.py b/processors/machine_learning/blip2_image_caption.py index c515085d9..109d811d2 100644 --- a/processors/machine_learning/blip2_image_caption.py +++ b/processors/machine_learning/blip2_image_caption.py @@ -9,6 +9,7 @@ from common.lib.exceptions import ProcessorInterruptedException from common.lib.user_input import UserInput from common.lib.item_mapping import MappedItem +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -26,8 +27,8 @@ class CategorizeImagesCLIP(BasicProcessor): description = "The BLIP2 model uses a pretrained image encoder combined with an LLM to generate image captions. 
The model can also be prompted and uses the image plus prompt to generate text responses." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - # Processors designed to handle input from this Dataset - followups = ["image-text-wall"] + # image datasets (image archives or image-downloader output), when BLIP2 is enabled + compatibility = Compatibility(media_types={"image"}, type_prefixes={"image-downloader"}, required_settings={"dmi-service-manager.fc_blip2_enabled", "dmi-service-manager.ab_server_address"}, preferred_followups=["image-text-wall"]) references = [ "[OpenAI CLIP blog](https://openai.com/research/clip)", @@ -67,15 +68,6 @@ def get_queue_id(cls, remote_id, details, dataset) -> str: # Unique queue for locally hosted models; used by other local model processors as well return "local_models" - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on image archives if enabled in Control Panel - """ - return config.get("dmi-service-manager.fc_blip2_enabled", False) and \ - config.get("dmi-service-manager.ab_server_address", False) and \ - (module.get_media_type() == "image" or module.type.startswith("image-downloader")) - @classmethod def get_options(cls, parent_dataset=None, config=None): """ diff --git a/processors/machine_learning/clip_categorize_images.py b/processors/machine_learning/clip_categorize_images.py index 30d29e463..99aa084d1 100644 --- a/processors/machine_learning/clip_categorize_images.py +++ b/processors/machine_learning/clip_categorize_images.py @@ -10,6 +10,7 @@ from common.lib.exceptions import ProcessorInterruptedException from common.lib.user_input import UserInput from common.lib.item_mapping import MappedItem +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -28,7 +29,8 @@ class CategorizeImagesCLIP(BasicProcessor): "the likelihood an image belongs to a category (total of all category values 
will be 100%).") # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - followups = ["image-category-wall"] + # image datasets (image archives or image-downloader output), when CLIP is enabled + compatibility = Compatibility(media_types={"image"}, type_prefixes={"image-downloader"}, required_settings={"dmi-service-manager.cc_clip_enabled", "dmi-service-manager.ab_server_address"}, preferred_followups=["image-category-wall"]) references = [ "[OpenAI CLIP blog](https://openai.com/research/clip)", @@ -69,15 +71,6 @@ def get_queue_id(cls, remote_id, details, dataset) -> str: # Unique queue for locally hosted models; used by other local model processors as well return "local_models" - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on image archives if enabled in Control Panel - """ - return config.get("dmi-service-manager.cc_clip_enabled", False) and \ - config.get("dmi-service-manager.ab_server_address", False) and \ - (module.get_media_type() == "image" or module.type.startswith("image-downloader")) - @classmethod def get_options(cls, parent_dataset=None, config=None): """ diff --git a/processors/machine_learning/pix-plot.py b/processors/machine_learning/pix-plot.py index 4668ff62f..7d0a6b991 100644 --- a/processors/machine_learning/pix-plot.py +++ b/processors/machine_learning/pix-plot.py @@ -13,6 +13,7 @@ from common.lib.dmi_service_manager import DmiServiceManager, DsmOutOfMemory, DmiServiceManagerException from common.lib.helpers import UserInput, ellipsiate from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -33,7 +34,8 @@ class PixPlotGenerator(BasicProcessor): "algorithmically grouped by similarity." 
extension = "html" # extension of result file, used internally and in UI - followups = [] + # image datasets (image archives or image-downloader output), when PixPlot is enabled + compatibility = Compatibility(media_types={"image"}, type_prefixes={"image-downloader"}, required_settings={"dmi-service-manager.db_pixplot_enabled", "dmi-service-manager.ab_server_address"}) references = [ "[PixPlot](https://pixplot.io/)", @@ -145,18 +147,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token sets; - Checks if pix-plot.server_url set - - :param module: Dataset or processor to determine compatibility with - """ - return config.get("dmi-service-manager.db_pixplot_enabled", False) and \ - config.get("dmi-service-manager.ab_server_address", False) and \ - (module.get_media_type() == "image" or module.type.startswith("image-downloader")) - def process(self): """ This takes a 4CAT results file as input, copies the images to a temp diff --git a/processors/machine_learning/text_from_image.py b/processors/machine_learning/text_from_image.py index 8dc7efe9a..0edce68a9 100644 --- a/processors/machine_learning/text_from_image.py +++ b/processors/machine_learning/text_from_image.py @@ -13,6 +13,7 @@ from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException from common.lib.item_mapping import MappedItem +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -37,8 +38,8 @@ class ImageTextDetector(BasicProcessor): """ extension = "ndjson" # extension of result file, used internally and in UI - # Processors designed to handle input from this Dataset - followups = ["image-text-wall"] + # image datasets (image archives or image-downloader output), when the OCR server is enabled + compatibility = Compatibility(media_types={"image"}, 
type_prefixes={"image-downloader"}, required_settings={"dmi-service-manager.eb_ocr_enabled", "dmi-service-manager.ab_server_address"}, preferred_followups=["image-text-wall"]) references = [ "[DMI OCR Server](https://github.com/digitalmethodsinitiative/ocr_server#readme)", @@ -102,17 +103,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: # }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on image sets - - :param module: Module to determine compatibility with - """ - return config.get('dmi-service-manager.eb_ocr_enabled', False) and \ - config.get("dmi-service-manager.ab_server_address", False) and \ - (module.get_media_type() == "image" or module.type.startswith("image-downloader")) - def process(self): """ This takes a 4CAT zip file of images, and outputs a NDJSON file with the diff --git a/processors/presets/upload-to-dmi-tcat.py b/processors/presets/upload-to-dmi-tcat.py index 74f039c6d..7e6004415 100644 --- a/processors/presets/upload-to-dmi-tcat.py +++ b/processors/presets/upload-to-dmi-tcat.py @@ -3,6 +3,7 @@ """ from backend.lib.preset import ProcessorPreset from common.lib.helpers import UserInput +from common.lib.compatibility import Compatibility class FourcatToDmiTcatConverterAndUploader(ProcessorPreset): """ @@ -14,6 +15,9 @@ class FourcatToDmiTcatConverterAndUploader(ProcessorPreset): description = "Convert the dataset to a TCAT-compatible format and upload it to an available TCAT server." 
# description displayed in UI extension = "html" + # Twitter v2 search results, when a TCAT server is configured + compatibility = Compatibility(types={"twitterv2-search"}, required_settings={"tcat-auto-upload.server_url", "tcat-auto-upload.token", "tcat-auto-upload.username", "tcat-auto-upload.password"}) + @classmethod def get_options(cls, parent_dataset=None, config=None): """ @@ -48,20 +52,6 @@ def get_options(cls, parent_dataset=None, config=None): else: return {} - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "twitterv2-search" and \ - config.get('tcat-auto-upload.server_url') and \ - config.get('tcat-auto-upload.token') and \ - config.get('tcat-auto-upload.username') and \ - config.get('tcat-auto-upload.password') - def get_processor_pipeline(self): """ This queues a series of post-processors to upload a dataset to a From e18f264d5c1ab4b875c72dadd573268eb686aa5b Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 17 Jun 2026 11:07:28 +0200 Subject: [PATCH 21/30] compatibility: clarify the dataset-required separation and make a helper --- common/lib/compatibility.py | 53 ++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/common/lib/compatibility.py b/common/lib/compatibility.py index 7a858b06f..9fad2bfcc 100644 --- a/common/lib/compatibility.py +++ b/common/lib/compatibility.py @@ -122,18 +122,20 @@ class Compatibility: # Datasources the processor accepts, e.g. {"4chan", "reddit"}. datasources: Optional[Iterable[str]] = None - # --- structural gates (each must hold when set) --- + # --- shape gates (no dataset needed; answerable from a processor class too) --- # Result-file extensions the processor accepts, e.g. {"csv", "ndjson"}. 
extensions: Optional[Iterable[str]] = None # When True, the processor only accepts a top-level dataset (one with no parent). top_dataset_only: bool = False - # When set, the dataset's is_rankable() must equal this. None means it does not matter. + + # --- dataset-required gates (read from the result file, so they need a real + # dataset and cannot be resolved from a processor class -- see requires_dataset) --- + # When set, is_rankable() must equal this (read from the result file). None = not checked. rankable: Optional[bool] = None # Forwarded to is_rankable(multiple_items=...) when `rankable` is set. False # restricts to single-value rankings (rejecting multi-column word_1/word_2/... rankings). rankable_multiple_items: bool = True - # Columns that must all be present in the dataset. This can only be checked - # against a real dataset, as it reads the dataset's columns. + # Columns that must all be present in the dataset, read from its columns. requires_columns: Iterable[str] = () # --- environment requirements --- @@ -153,6 +155,19 @@ class Compatibility: # when they are otherwise compatible. excluded_followups: Iterable[str] = () + @property + def requires_dataset(self) -> bool: + """ + Whether fully evaluating this spec needs a materialized DataSet. + + True when the `rankable` or `requires_columns` axis is set: both are read + from the produced result file, so they cannot be resolved from a + processor class alone. A consumer that reasons about processors without + real datasets (e.g. a processor map) can use this to mark those axes as + undecided rather than treating them as failed. + """ + return self.rankable is not None or bool(self.requires_columns) + def is_compatible_with(self, module, config=None) -> bool: """ Return whether `module` meets every requirement in this specification. 
@@ -170,16 +185,27 @@ def unmet_requirements(self, module, config=None, first_only=True) -> List[str]: that is missing -- a wrong dataset type, an absent column, a setting that is not configured, and so on. + The checks run in three tiers, cheapest first so the short-circuit can + skip later work once something fails: + + 1. shape -- type/extension/etc.; answerable from a processor class as + well as a real dataset (no file or system access); + 2. dataset-required -- `rankable` and `requires_columns`, read from the + produced result file, so they need a materialized DataSet (see + `requires_dataset`); + 3. environment -- configuration settings and system executables. + By default the method returns as soon as one requirement is unmet -- - enough for the yes/no `is_compatible_with`, and it skips the expensive - environment checks (config reads, shutil.which) whenever a cheap shape - check already fails. Pass `first_only=False` to collect every unmet - requirement -- used to explain why a module is not compatible. + enough for the yes/no `is_compatible_with`. Pass `first_only=False` to + collect every unmet requirement -- used to explain why a module is not + compatible. 
""" reasons: List[str] = [] if module is None: return ["no dataset provided"] + # --- tier 1: shape (no DataSet needed; also answerable from a processor class) --- + # if the processor names the kinds of dataset it accepts, the module # must be one of them if self._identity_declared() and not self._identity_matches(module): @@ -199,6 +225,9 @@ def unmet_requirements(self, module, config=None, first_only=True) -> List[str]: if first_only: return reasons + # --- tier 2: dataset-required (read from the result file; cannot be + # resolved from a processor class -- see requires_dataset) --- + if self.rankable is not None: if bool(_maybe_call(module, "is_rankable", multiple_items=self.rankable_multiple_items)) != self.rankable: reasons.append( @@ -208,7 +237,6 @@ def unmet_requirements(self, module, config=None, first_only=True) -> List[str]: if first_only: return reasons - # the only check that really needs a DataSet object if self.requires_columns: columns = _maybe_call(module, "get_columns") or [] missing = [column for column in self.requires_columns if column not in columns] @@ -217,15 +245,16 @@ def unmet_requirements(self, module, config=None, first_only=True) -> List[str]: if first_only: return reasons - # Note: this may have executable check that are expensive so we do it after cheaper checks - # TODO: could separate quicker setting checks for slower ones + # --- tier 3: environment (needs config/system, not a DataSet; the + # executable matchers here can be expensive, so this tier runs last. 
+ # TODO: cheap setting reads could be split out ahead of those matchers) --- for requirement in self.required_settings: key, expected = (requirement, None) if isinstance(requirement, str) else requirement value = config.get(key) if config is not None else None # no expected value; just check that the setting is truthy if expected is None: met = bool(value) - # some function to check (special check for things like {"video-downloader.ffmpeg_path": lambda p: shutil.which(p) is not None}) + # a function that validates the value (e.g. is_executable / ExecutableSibling) elif callable(expected): met = bool(expected(value)) # a collection of acceptable values From 345dd88c2dd53bef6608489c2769738569f9a744 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 17 Jun 2026 13:25:59 +0200 Subject: [PATCH 22/30] compatibility: excluded_types, is_collector, child_only axes --- common/lib/compatibility.py | 76 ++++++++++++++++++------- processors/conversion/extract_urls.py | 13 ++--- processors/conversion/merge_datasets.py | 12 +--- processors/filtering/column_filter.py | 24 ++------ processors/metrics/top_images.py | 15 +---- processors/metrics/youtube_metadata.py | 16 +----- 6 files changed, 75 insertions(+), 81 deletions(-) diff --git a/common/lib/compatibility.py b/common/lib/compatibility.py index 9fad2bfcc..b4c9adc06 100644 --- a/common/lib/compatibility.py +++ b/common/lib/compatibility.py @@ -112,7 +112,7 @@ class Compatibility: AND-ed). """ - # --- consumed data shape: identity (the module must match one of these) --- + # --- Identity axes: consumed data shape (the module must match one of these) --- # Dataset types the processor accepts, matched exactly. types: Optional[Iterable[str]] = None # Dataset type prefixes the processor accepts, matched with str.startswith. @@ -121,14 +121,31 @@ class Compatibility: media_types: Optional[Iterable[str]] = None # Datasources the processor accepts, e.g. {"4chan", "reddit"}. 
datasources: Optional[Iterable[str]] = None - - # --- shape gates (no dataset needed; answerable from a processor class too) --- - # Result-file extensions the processor accepts, e.g. {"csv", "ndjson"}. - extensions: Optional[Iterable[str]] = None + # Collector types (a dataset whose type ends in -search or -import) + # Compare with top_dataset_only, which reads key_parent; the two cover nearly + # the same datasets but differ in role (identity/OR vs gate/AND) + is_collector: bool = False + + # --- Shape gates (structural checks) --- + # Parent dataset types this processor CANNOT run on -- a hard gate + # (is_compatible_with returns False). Use when the processor would fail or + # produce garbage on that type (e.g. download_videos on telegram-search). + # For a soft filter use excluded_followups on the producer instead. + excluded_types: Iterable[str] = () + + # --- DataSet required gates --- + # The ground truth requires an existing DataSet to read its produced data + # TODO: processors could declare these attributes more explicitly # When True, the processor only accepts a top-level dataset (one with no parent). top_dataset_only: bool = False + # When True, the processor only accepts a non-top-level (child) dataset -- the + # inverse of top_dataset_only. + child_only: bool = False + # Result-file extensions the processor accepts, e.g. {"csv", "ndjson"}. + extensions: Optional[Iterable[str]] = None - # --- dataset-required gates (read from the result file, so they need a real + # --- Result file gates (reading the actual file is required) + # TODO: processors again could point to these attributes more explicitly if known # dataset and cannot be resolved from a processor class -- see requires_dataset) --- # When set, is_rankable() must equal this (read from the result file). None = not checked. rankable: Optional[bool] = None @@ -138,7 +155,7 @@ class Compatibility: # Columns that must all be present in the dataset, read from its columns.
requires_columns: Iterable[str] = () - # --- environment requirements --- + # --- Environment requirements --- # Executables that must be found on the system path (checked with shutil.which). required_packages: Iterable[str] = () # Configuration the processor needs. Each entry is either a setting key, @@ -148,23 +165,29 @@ class Compatibility: # returns whether it is acceptable. required_settings: Iterable = () - # --- follow-up processors --- + # --- Follow-up processors --- # Processor types to recommend first as next steps for this processor's output. preferred_followups: Iterable[str] = () - # Processor types that should never be offered as follow-ups here, even - # when they are otherwise compatible. + # Processor types never SUGGESTED as follow-ups after this one -- a soft + # filter (affects the suggestion list only; is_compatible_with is unchanged, + # so they can still be run directly). Use for "a more specific processor is + # preferred here" (e.g. tiktok-search excludes the generic video-downloader). + # For "that processor would fail on this output", use its excluded_types (hard). excluded_followups: Iterable[str] = () @property - def requires_dataset(self) -> bool: + def requires_dataset_result_file(self) -> bool: """ - Whether fully evaluating this spec needs a materialized DataSet. + Whether fully evaluating this spec needs the dataset's produced data. True when the `rankable` or `requires_columns` axis is set: both are read from the produced result file, so they cannot be resolved from a - processor class alone. A consumer that reasons about processors without - real datasets (e.g. a processor map) can use this to mark those axes as - undecided rather than treating them as failed. + processor class alone. (Shape axes such as top_dataset/extension also read + instance state, but they are recoverable from a dataset's shape; only + these two need the produced data, so only they are counted here.) 
A + consumer that reasons about processors without real datasets (e.g. a + processor map) can use this to mark those axes as undecided rather than + treating them as failed. """ return self.rankable is not None or bool(self.requires_columns) @@ -188,8 +211,10 @@ def unmet_requirements(self, module, config=None, first_only=True) -> List[str]: The checks run in three tiers, cheapest first so the short-circuit can skip later work once something fails: - 1. shape -- type/extension/etc.; answerable from a processor class as - well as a real dataset (no file or system access); + 1. structural -- the dataset's shape (type, extension, parent, + datasource); cheap, no result-file read. (Several still read instance + state -- is_top_dataset() -> key_parent, get_extension(), parameters -- + so on a bare processor class they return a stub, not a real answer.) 2. dataset-required -- `rankable` and `requires_columns`, read from the produced result file, so they need a materialized DataSet (see `requires_dataset`); @@ -204,7 +229,7 @@ def unmet_requirements(self, module, config=None, first_only=True) -> List[str]: if module is None: return ["no dataset provided"] - # --- tier 1: shape (no DataSet needed; also answerable from a processor class) --- + # --- tier 1: structural shape (cheap; no result-file read) --- # if the processor names the kinds of dataset it accepts, the module # must be one of them @@ -213,11 +238,21 @@ def unmet_requirements(self, module, config=None, first_only=True) -> List[str]: if first_only: return reasons + if self.excluded_types and getattr(module, "type", None) in set(self.excluded_types): + reasons.append("does not run on dataset type: %s" % getattr(module, "type", None)) + if first_only: + return reasons + if self.top_dataset_only and not _maybe_call(module, "is_top_dataset"): reasons.append("requires a top-level dataset") if first_only: return reasons + if self.child_only and _maybe_call(module, "is_top_dataset"): + reasons.append("requires a child 
(non-top-level) dataset") + if first_only: + return reasons + if self.extensions is not None: extension = _maybe_call(module, "get_extension") if extension not in set(self.extensions): @@ -278,7 +313,7 @@ def unmet_requirements(self, module, config=None, first_only=True) -> List[str]: def _identity_declared(self) -> bool: """Whether the processor names any kind of dataset it accepts.""" - return any( + return self.is_collector or any( axis is not None for axis in (self.types, self.type_prefixes, self.media_types, self.datasources) ) @@ -304,4 +339,7 @@ def _identity_matches(self, module) -> bool: if isinstance(parameters, dict) and parameters.get("datasource") in set(self.datasources): return True + if self.is_collector and _maybe_call(module, "is_from_collector"): + return True + return False diff --git a/processors/conversion/extract_urls.py b/processors/conversion/extract_urls.py index b0ee80ba3..d95600874 100644 --- a/processors/conversion/extract_urls.py +++ b/processors/conversion/extract_urls.py @@ -12,6 +12,7 @@ from common.lib.exceptions import ProcessorInterruptedException from backend.lib.processor import BasicProcessor from common.lib.helpers import UserInput +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Stijn Peeters", "Dale Wahl", "Sal Hagen"] @@ -31,6 +32,9 @@ class ExtractURLs(BasicProcessor): description = "Extract any URLs from selected column(s) with the option to expand shortened URLs." 
extension = "csv" + # any csv/ndjson dataset, except this processor's own filter output + compatibility = Compatibility(extensions={"csv", "ndjson"}, excluded_types={"extract-urls-filter"}) + # taken from https://github.com/timleland/url-shorteners # current as of 9 April 2021 redirect_domains = ( @@ -153,15 +157,6 @@ class ExtractURLs(BasicProcessor): "api.parler.com", "trib.al", "fb.watch", ) - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - All processor on CSV and NDJSON datasets - - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_extension() in ["csv", "ndjson"] and module.type != "extract-urls-filter" - @classmethod def get_options(cls, parent_dataset=None, config=None): """ diff --git a/processors/conversion/merge_datasets.py b/processors/conversion/merge_datasets.py index 9bec40a63..a2022b57e 100644 --- a/processors/conversion/merge_datasets.py +++ b/processors/conversion/merge_datasets.py @@ -9,6 +9,7 @@ from common.lib.exceptions import ProcessorInterruptedException, DataSetException from common.lib.helpers import UserInput from common.lib.item_mapping import MappedItem +from common.lib.compatibility import Compatibility import ural __author__ = "Stijn Peeters" @@ -29,15 +30,8 @@ class DatasetMerger(BasicProcessor): description = "Merge this dataset with other datasets of the same format. A new dataset is " \ "created containing a combination of items from the original datasets." 
# description displayed in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on any top-level CSV or NDJSON file - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_extension() in ("csv", "ndjson") and (module.is_from_collector()) + # a collector's csv or ndjson output + compatibility = Compatibility(is_collector=True, extensions={"csv", "ndjson"}) @staticmethod def get_dataset_from_url(url, db, modules=None): diff --git a/processors/filtering/column_filter.py b/processors/filtering/column_filter.py index 8b0729d9c..9e1fb83c3 100644 --- a/processors/filtering/column_filter.py +++ b/processors/filtering/column_filter.py @@ -7,6 +7,7 @@ from backend.lib.processor import BasicProcessor from processors.filtering.base_filter import BaseFilter from common.lib.helpers import UserInput, convert_to_int +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters", "Dale Wahl"] @@ -24,16 +25,8 @@ class ColumnFilter(BaseFilter): description = ("A flexible and customizable filter that lets you retain items in selected column that match a " "custom requirement. This creates a new dataset.") - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on top datasets that are CSV or NDJSON. 
- - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") + # top-level csv/ndjson datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -264,15 +257,8 @@ class ColumnProcessorFilter(ColumnFilter): title = "Filter by value" # title displayed in UI description = "A generic filter that checks whether a value in a selected column matches a custom requirement. " - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on child datasets and do not create a standalone dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager config: Configuration reader (context-aware) - """ - return not module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") + # child (non-top-level) csv/ndjson datasets + compatibility = Compatibility(child_only=True, extensions={"csv", "ndjson"}) @classmethod def is_filter(cls): diff --git a/processors/metrics/top_images.py b/processors/metrics/top_images.py index 7698e77ce..a6d5a8d87 100644 --- a/processors/metrics/top_images.py +++ b/processors/metrics/top_images.py @@ -6,6 +6,7 @@ from collections import Counter, OrderedDict from backend.lib.processor import BasicProcessor from common.lib.helpers import UserInput +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -25,18 +26,8 @@ class TopImageCounter(BasicProcessor): description = "Collect all image URLs and sort by most-occurring." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["image-downloader"] - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - All top-level datasets, excluding Telegram, which has a different image logic - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - - return module.is_top_dataset() and module.type != "telegram-search" and module.get_extension() in ("csv", "ndjson") + # top-level csv/ndjson datasets, except Telegram (which has its own image logic) + compatibility = Compatibility(top_dataset_only=True, excluded_types={"telegram-search"}, extensions={"csv", "ndjson"}, preferred_followups=["image-downloader"]) @classmethod def get_options(cls, parent_dataset=None, config=None): diff --git a/processors/metrics/youtube_metadata.py b/processors/metrics/youtube_metadata.py index e38b884b5..6dacf0fe3 100644 --- a/processors/metrics/youtube_metadata.py +++ b/processors/metrics/youtube_metadata.py @@ -11,6 +11,7 @@ from backend.lib.processor import BasicProcessor from common.lib.helpers import UserInput +from common.lib.compatibility import Compatibility __author__ = "Sal Hagen" __credits__ = ["Sal Hagen"] @@ -36,7 +37,8 @@ class YouTubeMetadata(BasicProcessor): "Uses the YouTube API.") # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["youtube-thumbnails"] + # collector output or extract-urls-filter output, as csv/ndjson (may contain youtube links) + compatibility = Compatibility(is_collector=True, types={"extract-urls-filter"}, extensions={"csv", "ndjson"}, preferred_followups=["youtube-thumbnails"]) max_retries = 3 sleep_time = 20 @@ -135,18 +137,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor 
on datasets probably containing youtube links - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - # Compatible with every top-level dataset. - return ((module.is_top_dataset() and module.get_extension() in ("csv", "ndjson")) - or module.type == "extract-urls-filter") - def process(self): """ Writes a csv file with metadata of extracted YouTube objects. From 6f6421a3619769a9894f7e6f9b7846f28172ca74 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 17 Jun 2026 13:49:11 +0200 Subject: [PATCH 23/30] compatibility: base downloaders --- processors/visualisation/download_images.py | 27 +++++---------------- processors/visualisation/download_videos.py | 26 +++++--------------- 2 files changed, 12 insertions(+), 41 deletions(-) diff --git a/processors/visualisation/download_images.py b/processors/visualisation/download_images.py index 7bde41b03..bd13b21c7 100644 --- a/processors/visualisation/download_images.py +++ b/processors/visualisation/download_images.py @@ -14,6 +14,7 @@ from backend.lib.processor import BasicProcessor from backend.lib.proxied_requests import FailedProxiedRequest from common.lib.exceptions import ProcessorInterruptedException, FourcatException +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -44,6 +45,8 @@ class ImageDownloader(BasicProcessor): extension = "zip" # extension of result file, used internally and in UI media_type = "image" # media type of the dataset + # Shared list -- other download_* processors reuse this as ImageDownloader.followups + # (and preferred_followups below reuses it), so it stays a named attribute. 
followups = [ "image-wall", "image-category-wall", @@ -56,6 +59,9 @@ class ImageDownloader(BasicProcessor): "google-vision-api", ] + # top-image rankings or any collector's csv/ndjson output, except sources with their own image collection + compatibility = Compatibility(is_collector=True, types={"top-images"}, excluded_types={"tiktok-search", "tiktok-urls-search", "telegram-search", "fourchan-search"}, extensions={"csv", "ndjson"}, preferred_followups=followups) + config = { "image-downloader.max": { "type": UserInput.OPTION_TEXT, @@ -138,27 +144,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on top image rankings, collectors, but not specific collectors with their own image - collection methods - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return ( - (module.type == "top-images" or module.is_from_collector()) - and module.type - not in [ - "tiktok-search", - "tiktok-urls-search", - "telegram-search", - "fourchan-search", - ] - and module.get_extension() in ("csv", "ndjson") - ) - def process(self): """ This takes a 4CAT results file as input, and outputs a zip file with diff --git a/processors/visualisation/download_videos.py b/processors/visualisation/download_videos.py index d559006d6..86f0b7ed5 100644 --- a/processors/visualisation/download_videos.py +++ b/processors/visualisation/download_videos.py @@ -20,6 +20,7 @@ from backend.lib.processor import BasicProcessor from backend.lib.proxied_requests import FailedProxiedRequest +from common.lib.compatibility import Compatibility from common.lib.dataset import DataSet from common.lib.exceptions import ProcessorInterruptedException, ProcessorException, DataSetException from common.lib.helpers import UserInput, sets_to_lists, url_to_filename @@ -95,8 +96,13 @@ class 
VideoDownloaderPlus(BasicProcessor): extension = "zip" # extension of result file, used internally and in UI media_type = "video" # media type of the processor + # Shared list -- other download_* processors reuse this as VideoDownloaderPlus.followups + # (and preferred_followups below reuses it), so it stays a named attribute. followups = ["audio-extractor", "metadata-viewer", "video-scene-detector", "preset-scene-timelines", "video-stack", "preset-video-hashes", "video-hasher-1", "video-frames"] + # any collector's csv/ndjson output (except sources with their own downloaders), plus the tiktok-metadata helper + compatibility = Compatibility(is_collector=True, types={"tiktok-video-downloader-metadata"}, excluded_types={"tiktok-search", "tiktok-urls-search", "telegram-search"}, extensions={"csv", "ndjson"}, preferred_followups=followups) + references = [ "[YT-DLP python package](https://github.com/yt-dlp/yt-dlp/#readme)", "[Supported sites](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)", @@ -302,26 +308,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine compatibility - - Compatible with any top-level dataset. Could run on any type of dataset - in principle, but any links to videos are likely to come from the top - dataset anyway. 
- - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return ((module.type.endswith("-search") or - module.is_from_collector() or - module.type == "tiktok-video-downloader-metadata") - # These have their own video downloaders - and module.type not in ["tiktok-search", "tiktok-urls-search", "telegram-search"]) \ - and module.get_extension() in ("csv", "ndjson") - def process(self): """ This takes a 4CAT results file as input, and downloads video files From e466c3f744d2acaa572c9c942894569a9df993e1 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 17 Jun 2026 13:50:20 +0200 Subject: [PATCH 24/30] compatibility: keep `is_compatible_with` overrides (for credentials), but add coarse map specs in compatbility --- processors/machine_learning/generate_images.py | 5 ++++- .../visualisation/download-telegram-images.py | 4 +++- .../visualisation/download_telegram_files.py | 8 +++++--- .../visualisation/download_telegram_videos.py | 5 ++++- processors/visualisation/download_tiktok_video.py | 15 +++++++-------- 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/processors/machine_learning/generate_images.py b/processors/machine_learning/generate_images.py index 7ca0d219e..273d0835c 100644 --- a/processors/machine_learning/generate_images.py +++ b/processors/machine_learning/generate_images.py @@ -11,6 +11,7 @@ from processors.visualisation.download_images import ImageDownloader from common.lib.dmi_service_manager import DmiServiceManager, DmiServiceManagerException, DsmOutOfMemory from common.lib.user_input import UserInput +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -28,7 +29,9 @@ class StableDiffusionImageGenerator(BasicProcessor): description = "Given a list of prompts, generates images using the Stable Diffusion XL image model." 
# description displayed in UI extension = "zip" # extension of result file, used internally and in UI - followups = ImageDownloader.followups + # coarse map spec; is_compatible_with (below) is the runtime truth -- it also requires the + # dataset to have columns (a prompt source), which can't be declared statically + compatibility = Compatibility(required_settings={"dmi-service-manager.sd_enabled", "dmi-service-manager.ab_server_address"}, preferred_followups=ImageDownloader.followups) references = [ "[Stable Diffusion XL 1.0 model card](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)" diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py index 0d692b66c..d69c74f89 100644 --- a/processors/visualisation/download-telegram-images.py +++ b/processors/visualisation/download-telegram-images.py @@ -8,6 +8,7 @@ from common.lib.helpers import UserInput from processors.visualisation.download_images import ImageDownloader from processors.visualisation.download_telegram_videos import TelegramVideoDownloader +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -32,7 +33,8 @@ class TelegramImageDownloader(TelegramVideoDownloader): extension = "zip" media_type = "image" - followups = ImageDownloader.followups + # coarse map spec; is_compatible_with (below) is the runtime truth (Telegram API creds) + compatibility = Compatibility(types={"telegram-search"}, preferred_followups=ImageDownloader.followups) config = { "image-downloader-telegram.max": { diff --git a/processors/visualisation/download_telegram_files.py b/processors/visualisation/download_telegram_files.py index 4f5c2bead..20870953d 100644 --- a/processors/visualisation/download_telegram_files.py +++ b/processors/visualisation/download_telegram_files.py @@ -14,6 +14,7 @@ from common.lib.helpers import UserInput from processors.visualisation.download_telegram_videos import 
TelegramVideoDownloader +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -42,9 +43,10 @@ class TelegramFileDownloader(TelegramVideoDownloader): extension = "zip" media_type = "file" - # no followups -- file outputs are heterogeneous and don't map cleanly to - # existing video/image follow-on processors - followups = [] + # coarse map spec; is_compatible_with (below) is the runtime truth (Telegram API creds). + # No preferred_followups -- file outputs are heterogeneous and don't map cleanly to + # existing video/image follow-on processors. + compatibility = Compatibility(types={"telegram-search"}, required_settings={"file-downloader-telegram.allow_files"}) config = { "file-downloader-telegram.allow_files": { diff --git a/processors/visualisation/download_telegram_videos.py b/processors/visualisation/download_telegram_videos.py index 270113713..a463d0126 100644 --- a/processors/visualisation/download_telegram_videos.py +++ b/processors/visualisation/download_telegram_videos.py @@ -22,6 +22,7 @@ class attributes to switch behavior for a different media type. 
from processors.visualisation.download_videos import VideoDownloaderPlus from common.lib.helpers import UserInput, timify from common.lib.dataset import DataSet +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters", "Dale Wahl"] @@ -46,7 +47,9 @@ class TelegramVideoDownloader(BasicProcessor): media_type = "video" # media type of the result flawless = True - followups = VideoDownloaderPlus.followups + # coarse map spec; is_compatible_with (below) is the runtime truth -- it also checks the + # source dataset carries Telegram API credentials, which are read from the dataset + compatibility = Compatibility(types={"telegram-search"}, required_settings={"video-downloader-telegram.allow_videos"}, preferred_followups=VideoDownloaderPlus.followups) config = { "video-downloader-telegram.max_videos": { diff --git a/processors/visualisation/download_tiktok_video.py b/processors/visualisation/download_tiktok_video.py index 8103a0933..97a809602 100644 --- a/processors/visualisation/download_tiktok_video.py +++ b/processors/visualisation/download_tiktok_video.py @@ -12,6 +12,7 @@ from processors.visualisation.download_videos import VideoDownloaderPlus from backend.lib.processor import BasicProcessor from datasources.tiktok_urls.search_tiktok_urls import TikTokScraper +from common.lib.compatibility import Compatibility class TikTokVideoDownloader(ProcessorPreset): """ @@ -26,7 +27,9 @@ class TikTokVideoDownloader(ProcessorPreset): extension = "zip" media_type = "video" - followups = VideoDownloaderPlus.followups + # coarse map spec; is_compatible_with (below) is the runtime truth -- it also accepts + # tiktok uploads, which depends on the dataset label and can't be declared statically + compatibility = Compatibility(types={"tiktok-search", "tiktok-urls-search"}, preferred_followups=VideoDownloaderPlus.followups) @classmethod def get_options(cls, parent_dataset=None, config=None): @@ -125,13 +128,9 @@ class 
TikTokVideoMetadata(BasicProcessor): consecutive_failures = None - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Do not show anywhere - """ - return False - + # internal helper dataset; never offered as a processor + compatibility = Compatibility(types=set()) + @classmethod def get_options(cls, parent_dataset=None, config=None): """ From 7483df117b0403aac97f4537469adb46f7e0dc4d Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 17 Jun 2026 14:09:53 +0200 Subject: [PATCH 25/30] compatibility: couple more with overrides --- processors/conversion/export_datasets.py | 5 +++++ processors/networks/image-network.py | 4 ++++ .../visualisation/image_category_wall.py | 19 ++++--------------- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py index 83af5351c..2dc12308e 100644 --- a/processors/conversion/export_datasets.py +++ b/processors/conversion/export_datasets.py @@ -8,6 +8,7 @@ from backend.lib.processor import BasicProcessor from common.lib.dataset import DataSet from common.lib.exceptions import DataSetException +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -27,6 +28,10 @@ class ExportDatasets(BasicProcessor): "another 4CAT instance. Filters are not included. 
Results expire after one day.") # description displayed in UI extension = "zip" # extension of result file, used internally and in UI + # coarse map spec; is_compatible_with (below) is the runtime truth -- it also checks + # the requesting user owns the dataset (is_accessible_by), which is per-user, not shape + compatibility = Compatibility(top_dataset_only=True) + @classmethod def is_compatible_with(cls, module=None, config=None): """ diff --git a/processors/networks/image-network.py b/processors/networks/image-network.py index ab71e864d..ee573604b 100644 --- a/processors/networks/image-network.py +++ b/processors/networks/image-network.py @@ -15,6 +15,7 @@ from common.lib.exceptions import ProcessorInterruptedException from common.lib.user_input import UserInput +from common.lib.compatibility import Compatibility class ImageGrapher(BasicProcessor): @@ -33,6 +34,9 @@ class ImageGrapher(BasicProcessor): "'Image Preview' plugin.") extension = "gexf" # extension of result file, used internally and in UI + # coarse map spec; is_compatible_with (below) is the runtime truth -- it also walks the + # genealogy to find an image-downloader root (get_root_dataset) + compatibility = Compatibility(type_prefixes={"image-downloader"}) @classmethod def get_options(cls, parent_dataset=None, config=None): diff --git a/processors/visualisation/image_category_wall.py b/processors/visualisation/image_category_wall.py index 95acc9627..7dd9df5c2 100644 --- a/processors/visualisation/image_category_wall.py +++ b/processors/visualisation/image_category_wall.py @@ -17,6 +17,7 @@ from common.lib.helpers import UserInput, convert_to_int, get_4cat_canvas from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl", "Stijn Peeters"] @@ -36,6 +37,9 @@ class ImageCategoryWallGenerator(BasicProcessor): description = "Combine images into a 
single image arranged by category" # description displayed in UI extension = "svg" # extension of result file, used internally and in UI + # image-category, image-downloader, or video-hash datasets (except screenshot downloads) + compatibility = Compatibility(type_prefixes={"image-to-categories", "image-downloader", "video-hasher-1", "video-hash-similarity-matrix"}, excluded_types={"image-downloader-screenshots-search"}) + number_of_ranges = 10 # number of ranges to use for numeric categories image_datasets = ["image-downloader", "video-hasher-1"] @@ -57,21 +61,6 @@ class ImageCategoryWallGenerator(BasicProcessor): } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on CLIP dataset only - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return ( - module.type.startswith("image-to-categories") - or module.type.startswith("image-downloader") - or module.type.startswith("video-hasher-1") - or module.type.startswith("video-hash-similarity-matrix") - ) and module.type not in ["image-downloader-screenshots-search"] - @classmethod def get_options(cls, parent_dataset=None, config=None): """ From 3c0da2d6e8f11411a7acc0a50e929ec4df18cdd3 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 17 Jun 2026 14:24:10 +0200 Subject: [PATCH 26/30] compatibility: requires ANY column (in addition to requires all columns) --- common/lib/compatibility.py | 44 ++++++++++++--------- processors/networks/cotag_network.py | 15 ++----- processors/networks/user_hashtag_network.py | 16 ++------ processors/presets/top-hashtags.py | 16 ++------ 4 files changed, 38 insertions(+), 53 deletions(-) diff --git a/common/lib/compatibility.py b/common/lib/compatibility.py index b4c9adc06..b4e74f0ea 100644 --- a/common/lib/compatibility.py +++ b/common/lib/compatibility.py @@ -152,8 +152,10 @@ class Compatibility: # Forwarded to is_rankable(multiple_items=...) 
when `rankable` is set. False # restricts to single-value rankings (rejecting multi-column word_1/word_2/... rankings). rankable_multiple_items: bool = True - # Columns that must all be present in the dataset, read from its columns. - requires_columns: Iterable[str] = () + # Columns that must ALL be present in the dataset, read from its columns. + requires_all_columns: Iterable[str] = () + # Columns of which AT LEAST ONE must be present, read from its columns. + requires_any_columns: Iterable[str] = () # --- Environment requirements --- # Executables that must be found on the system path (checked with shutil.which). @@ -180,16 +182,18 @@ def requires_dataset_result_file(self) -> bool: """ Whether fully evaluating this spec needs the dataset's produced data. - True when the `rankable` or `requires_columns` axis is set: both are read - from the produced result file, so they cannot be resolved from a - processor class alone. (Shape axes such as top_dataset/extension also read - instance state, but they are recoverable from a dataset's shape; only - these two need the produced data, so only they are counted here.) A - consumer that reasons about processors without real datasets (e.g. a - processor map) can use this to mark those axes as undecided rather than - treating them as failed. + True when `rankable`, `requires_all_columns`, or `requires_any_columns` is + set: all are read from the produced result file, so they cannot be + resolved from a processor class alone. (Shape axes such as + top_dataset/extension also read instance state, but they are recoverable + from a dataset's shape; only these need the produced data, so only they + are counted here.) A consumer that reasons about processors without real + datasets (e.g. a processor map) can use this to mark those axes as + undecided rather than treating them as failed. 
""" - return self.rankable is not None or bool(self.requires_columns) + return (self.rankable is not None + or bool(self.requires_all_columns) + or bool(self.requires_any_columns)) def is_compatible_with(self, module, config=None) -> bool: """ @@ -215,9 +219,9 @@ def unmet_requirements(self, module, config=None, first_only=True) -> List[str]: datasource); cheap, no result-file read. (Several still read instance state -- is_top_dataset() -> key_parent, get_extension(), parameters -- so on a bare processor class they return a stub, not a real answer.) - 2. dataset-required -- `rankable` and `requires_columns`, read from the - produced result file, so they need a materialized DataSet (see - `requires_dataset`); + 2. dataset-required -- `rankable`, `requires_all_columns`, and + `requires_any_columns`, read from the produced result file, so they + need a materialized DataSet (see `requires_dataset_result_file`); 3. environment -- configuration settings and system executables. By default the method returns as soon as one requirement is unmet -- @@ -261,7 +265,7 @@ def unmet_requirements(self, module, config=None, first_only=True) -> List[str]: return reasons # --- tier 2: dataset-required (read from the result file; cannot be - # resolved from a processor class -- see requires_dataset) --- + # resolved from a processor class -- see requires_dataset_result_file) --- if self.rankable is not None: if bool(_maybe_call(module, "is_rankable", multiple_items=self.rankable_multiple_items)) != self.rankable: @@ -272,11 +276,15 @@ def unmet_requirements(self, module, config=None, first_only=True) -> List[str]: if first_only: return reasons - if self.requires_columns: + if self.requires_all_columns or self.requires_any_columns: columns = _maybe_call(module, "get_columns") or [] - missing = [column for column in self.requires_columns if column not in columns] + missing = [column for column in self.requires_all_columns if column not in columns] if missing: - reasons.append("requires 
column(s): %s" % ", ".join(missing)) + reasons.append("requires all column(s): %s" % ", ".join(missing)) + if first_only: + return reasons + if self.requires_any_columns and not any(column in columns for column in self.requires_any_columns): + reasons.append("requires any of column(s): %s" % ", ".join(self.requires_any_columns)) if first_only: return reasons diff --git a/processors/networks/cotag_network.py b/processors/networks/cotag_network.py index 2464e3dd8..6e4ec3816 100644 --- a/processors/networks/cotag_network.py +++ b/processors/networks/cotag_network.py @@ -4,6 +4,7 @@ from backend.lib.preset import ProcessorPreset from common.lib.helpers import UserInput +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -26,6 +27,9 @@ class CoTaggerPreset(ProcessorPreset): possible_tag_columns = {"tags", "hashtags", "groups"} + # datasets with at least one tag-like column + compatibility = Compatibility(requires_any_columns=possible_tag_columns) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -53,17 +57,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on datasets containing a tags column - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - columns = module.get_columns() - return bool(set(columns) & cls.possible_tag_columns) if columns else False - def get_processor_pipeline(self): """ Generate co-tag graph of items diff --git a/processors/networks/user_hashtag_network.py b/processors/networks/user_hashtag_network.py index 7ec1d11b7..b21366cf7 100644 --- a/processors/networks/user_hashtag_network.py +++ b/processors/networks/user_hashtag_network.py @@ -3,6 +3,7 @@ """ from backend.lib.preset import ProcessorPreset from common.lib.user_input import UserInput +from 
common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" @@ -21,6 +22,9 @@ class HashtagUserBipartiteGrapherPreset(ProcessorPreset): description = "Produces a bipartite graph based on co-occurence of (hash)tags and authors. If someone wrote a post with a certain tag, there will be a link between that person and the tag. The more often they appear together, the stronger the link. Tag nodes are weighed on how often they occur. User nodes are weighed on how many posts they've made." # description displayed in UI extension = "gexf" # extension of result file, used internally and in UI + # datasets with at least one tag-like column + compatibility = Compatibility(requires_any_columns={"tags", "hashtags", "groups"}) + @classmethod def get_options(cls, parent_dataset=None, config=None): return { @@ -32,18 +36,6 @@ def get_options(cls, parent_dataset=None, config=None): } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on datasets containing a tags column - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - usable_columns = {"tags", "hashtags", "groups"} - columns = module.get_columns() - return bool(set(columns) & usable_columns) if columns else False - def get_processor_pipeline(self): """ Generate bipartite user-hashtag graph of items diff --git a/processors/presets/top-hashtags.py b/processors/presets/top-hashtags.py index 93a417693..dc79a784e 100644 --- a/processors/presets/top-hashtags.py +++ b/processors/presets/top-hashtags.py @@ -4,6 +4,7 @@ from backend.lib.preset import ProcessorPreset from common.lib.helpers import UserInput from processors.networks.cotag_network import CoTaggerPreset +from common.lib.compatibility import Compatibility class TopHashtags(ProcessorPreset): @@ -16,6 +17,9 @@ class TopHashtags(ProcessorPreset): description = "Count how often each hashtag occurs in the dataset and sort by this 
value" extension = "csv" + # datasets with at least one tag-like column + compatibility = Compatibility(requires_any_columns=CoTaggerPreset.possible_tag_columns) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -43,18 +47,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Check if dataset has a hashtag attribute - - :param module: Dataset to check - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - columns = module.get_columns() - return columns and any([tag in columns for tag in CoTaggerPreset.possible_tag_columns]) - def get_processor_pipeline(self): """ This is basically a 'count values' processor with some defaults From 20dd2ff6b2133bd147326c63bd94c01c0436db7c Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 17 Jun 2026 14:31:10 +0200 Subject: [PATCH 27/30] video_hasher: easy compatibility --- processors/visualisation/video_hasher.py | 30 +++++------------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/processors/visualisation/video_hasher.py b/processors/visualisation/video_hasher.py index f63120803..8271e7b69 100644 --- a/processors/visualisation/video_hasher.py +++ b/processors/visualisation/video_hasher.py @@ -16,7 +16,7 @@ from backend.lib.processor import BasicProcessor from backend.lib.preset import ProcessorAdvancedPreset -from common.lib.compatibility import Compatibility +from common.lib.compatibility import Compatibility, is_executable from common.lib.exceptions import ProcessorInterruptedException, ProcessorException from common.lib.user_input import UserInput @@ -36,6 +36,9 @@ class VideoHasherPreset(ProcessorAdvancedPreset): description = "Creates video hashes (64 bits/identifiers) to identify near duplicate videos in a dataset based on hash similarity. Uses video only. 
This process can take a long time depending on video length, amount, and frames per second." extension = "gexf" + # video datasets, when ffmpeg is available + compatibility = Compatibility(media_types={"video"}, type_prefixes={"video-downloader"}, required_settings={("video-downloader.ffmpeg_path", is_executable)}) + @classmethod def get_options(cls, parent_dataset=None, config=None): return { @@ -70,21 +73,6 @@ def get_options(cls, parent_dataset=None, config=None): } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine compatibility - - Compatible with downloaded videos, and not really anything else! - Additionally ffmpeg needs to be available. - - :param DataSet module: Module ID to determine compatibility with - :return bool: - """ - return (module.get_media_type() == "video" or module.type.startswith("video-downloader")) and \ - config.get("video-downloader.ffmpeg_path") and \ - shutil.which(config.get("video-downloader.ffmpeg_path")) - def get_processor_advanced_pipeline(self, attach_to=None): """ This queues a series of post-processors to visualise videos. 
@@ -146,7 +134,8 @@ class VideoHasher(BasicProcessor): extension = "zip" # extension of result file, used internally and in UI media_type = "image" # media type of the result - followups = ["video-hash-network", "video-hash-similarity-matrix"] + # video datasets (collages are made from video frames) + compatibility = Compatibility(media_types={"video"}, type_prefixes={"video-downloader"}, preferred_followups=["video-hash-network", "video-hash-similarity-matrix"]) @classmethod def get_options(cls, parent_dataset=None, config=None): @@ -175,13 +164,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on videos only - """ - return module.get_media_type() == "video" or module.type.startswith("video-downloader") - def process(self): """ This takes a zipped set of videos, uses https://pypi.org/project/videohash/ and https://ffmpeg.org/ to collect From 248622f15c90e8770afc0dd238d8081ada786f8b Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 17 Jun 2026 14:34:10 +0200 Subject: [PATCH 28/30] compatibilities w/ overrides --- processors/machine_learning/llm_prompter.py | 5 +++++ processors/machine_learning/prompt_compass.py | 9 +++++++-- processors/metrics/annotation_metadata.py | 5 +++++ processors/visualisation/image_wall_w_text.py | 5 +++++ 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/processors/machine_learning/llm_prompter.py b/processors/machine_learning/llm_prompter.py index c2bd0d02e..5b823daa4 100644 --- a/processors/machine_learning/llm_prompter.py +++ b/processors/machine_learning/llm_prompter.py @@ -18,6 +18,7 @@ from common.lib.helpers import UserInput, nthify, andify, remove_nuls, flatten_dict from common.lib.llm import LLMAdapter from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility class LLMPrompter(BasicProcessor): """ @@ -30,6 +31,10 @@ class LLMPrompter(BasicProcessor): "entity 
extraction, or OCR. Supported APIs include OpenAI, Google, Anthropic, Mistral, and DeepSeek.") extension = "ndjson" # extension of result file, used internally and in UI. In this case it's variable! + # coarse map spec; is_compatible_with (below) is the runtime truth -- it accepts csv/ndjson + # tables, OR zip archives of image/video/audio media (_almost_ all zips but not) + compatibility = Compatibility(extensions={"csv", "ndjson", "zip"}) + references = [ "[Törnberg, Petter. 2023. 'How to Use LLMs for Text Analysis.' arXiv:2307.13106.](https://arxiv.org/pdf/2307." "13106)", diff --git a/processors/machine_learning/prompt_compass.py b/processors/machine_learning/prompt_compass.py index 076bd916f..8fa6ef3f7 100644 --- a/processors/machine_learning/prompt_compass.py +++ b/processors/machine_learning/prompt_compass.py @@ -4,6 +4,7 @@ from backend.lib.preset import ProcessorPreset from common.lib.helpers import UserInput from common.lib.llm import LLMAdapter +from common.lib.compatibility import Compatibility from common.lib.exceptions import ( QueryParametersException, @@ -25,6 +26,10 @@ class PromptCompassRunner(ProcessorPreset): "original dataset as a new column.") extension = "ndjson" + # coarse map spec; is_compatible_with (below) is the runtime truth -- it also checks + # that LLM models are configured (get_available_models) + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + references = [ "This processor is an implementation of the stand-alone tool [PromptCompass](https://github.com/ErikBorra/PromptCompass) by Erik Borra.", "See the processor options for references to the sources of each prompt in the library." 
@@ -82,8 +87,8 @@ def get_available_models(config): return models - @staticmethod - def is_compatible_with(module=None, config=None): + @classmethod + def is_compatible_with(cls, module=None, config=None): """ Determine compatibility diff --git a/processors/metrics/annotation_metadata.py b/processors/metrics/annotation_metadata.py index d57a41e48..21fb091ba 100644 --- a/processors/metrics/annotation_metadata.py +++ b/processors/metrics/annotation_metadata.py @@ -3,6 +3,7 @@ """ from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from datetime import datetime @@ -17,6 +18,10 @@ class AnnotationMetadata(BasicProcessor): "Includes annotation author, timestamp, type, etc.") # description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # coarse map spec (accepts any dataset); is_compatible_with (below) is the runtime + # truth -- it requires the dataset to actually have annotations (annotation_fields) + compatibility = Compatibility() + @classmethod def is_compatible_with(cls, module=None, config=None): """ diff --git a/processors/visualisation/image_wall_w_text.py b/processors/visualisation/image_wall_w_text.py index cdc096323..71b4f753e 100644 --- a/processors/visualisation/image_wall_w_text.py +++ b/processors/visualisation/image_wall_w_text.py @@ -18,6 +18,7 @@ from common.lib.helpers import UserInput, convert_to_int, get_4cat_canvas from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl", "Stijn Peeters"] @@ -39,6 +40,10 @@ class ImageTextWallGenerator(BasicProcessor): caption_datasets = ["image-captions", "text-from-images"] combined_dataset = ["image-downloader-stable-diffusion"] + # coarse map spec; is_compatible_with (below) is the runtime truth -- it walks the + # genealogy (identity_dataset_types) to confirm 
both an image and a text/caption dataset + compatibility = Compatibility(types=set(combined_dataset), type_prefixes=set(caption_datasets)) + @classmethod def is_compatible_with(cls, module=None, config=None): """ From 3302f48bccf8cd5c578d659697c5a8140d8da241 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 17 Jun 2026 14:54:10 +0200 Subject: [PATCH 29/30] compatibility cleanup --- datasources/audio_to_text/audio_to_text.py | 5 ++--- processors/machine_learning/audio_to_text.py | 2 -- processors/machine_learning/perspective.py | 4 ++++ 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/datasources/audio_to_text/audio_to_text.py b/datasources/audio_to_text/audio_to_text.py index 42ca241bd..ead409088 100644 --- a/datasources/audio_to_text/audio_to_text.py +++ b/datasources/audio_to_text/audio_to_text.py @@ -16,9 +16,8 @@ class AudioUploadToText(SearchMedia): title = "Convert speech to text" # title displayed in UI description = "Upload your own audio and use OpenAI's Whisper or GPT models to create transcripts" # description displayed in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - return AudioToText.is_compatible_with(module=module, config=config) + # reuse the AudioToText processor's compatibility -- this datasource runs it on uploaded audio + compatibility = AudioToText.compatibility @classmethod def get_options(cls, *args, **kwargs): diff --git a/processors/machine_learning/audio_to_text.py b/processors/machine_learning/audio_to_text.py index 6eb6b1a3a..ed4078ba0 100644 --- a/processors/machine_learning/audio_to_text.py +++ b/processors/machine_learning/audio_to_text.py @@ -33,8 +33,6 @@ class AudioToText(BasicProcessor): # Allow on audio datasets compatibility = Compatibility(media_types={"audio"}, type_prefixes={"audio-extractor"}) - followups = [] - references = [ "[OpenAI Whisper blog](https://openai.com/research/whisper)", "[OpenAI speech to 
text](https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/whisper" diff --git a/processors/machine_learning/perspective.py b/processors/machine_learning/perspective.py index f8d1467fd..4b88035d3 100644 --- a/processors/machine_learning/perspective.py +++ b/processors/machine_learning/perspective.py @@ -9,6 +9,7 @@ from backend.lib.processor import BasicProcessor from googleapiclient import discovery from common.lib.item_mapping import MappedItem +from common.lib.compatibility import Compatibility class Perspective(BasicProcessor): """ @@ -21,6 +22,9 @@ class Perspective(BasicProcessor): "including 'toxicity', 'insult', and 'profanity'.") # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI + # top-level text datasets (scores text columns via the Perspective API) + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + references = [ "[Perspective API documentation](https://developers.perspectiveapi.com/s/about-the-api)", "[Rieder, Bernhard, and Yarden Skop. 2021. 'The fabrics of machine moderation: Studying the technical, " From 0fc03f7bbe6cfccb9c6b6e0832e633271be51a25 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 17 Jun 2026 15:13:04 +0200 Subject: [PATCH 30/30] clean up hasattr is_compatible_with checks --- common/lib/dataset.py | 8 +++----- webtool/views/api_standalone.py | 2 +- webtool/views/api_tool.py | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/common/lib/dataset.py b/common/lib/dataset.py index e917a1aba..358d8a934 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -2027,11 +2027,9 @@ def get_compatible_processors(self, config=None): ): continue - # consider a processor compatible if its is_compatible_with - # method returns True *or* if it has no explicit compatibility - # check and this dataset is top-level (i.e. 
has no parent) - if (not hasattr(processor, "is_compatible_with") and not self.key_parent) \ - or (hasattr(processor, "is_compatible_with") and processor.is_compatible_with(self, config=config)): + # evaluates the processor's declarative `compatibility` (or its is_compatible_with override); + # undeclared processors default to top-level-only + if processor.is_compatible_with(self, config=config): available[processor_type] = processor return available diff --git a/webtool/views/api_standalone.py b/webtool/views/api_standalone.py index 796562fc6..0362171ac 100644 --- a/webtool/views/api_standalone.py +++ b/webtool/views/api_standalone.py @@ -137,7 +137,7 @@ def is_rankable(self, multiple_items=False): continue # Check if the processor is compatible with the fake dataset - if hasattr(processor, "is_compatible_with") and not processor.is_compatible_with(fake_dataset): + if not processor.is_compatible_with(fake_dataset, g.config): continue available_processors[processor_type] = processor diff --git a/webtool/views/api_tool.py b/webtool/views/api_tool.py index b4068d361..d1488d997 100644 --- a/webtool/views/api_tool.py +++ b/webtool/views/api_tool.py @@ -209,7 +209,7 @@ def get_processor_options(processor_type, dataset_id=None): return error(404, message="Dataset '%s' does not exist" % dataset_id) # Check compatibility of processor with dataset - if hasattr(processor, "is_compatible_with") and not processor.is_compatible_with(dataset, g.config): + if not processor.is_compatible_with(dataset, g.config): return error(422, message="Processor '%s' is not compatible with dataset '%s'" % (processor_type, dataset_id)) worker_options = processor.get_options(dataset, g.config)