diff --git a/backend/lib/processor.py b/backend/lib/processor.py index b4f1c05aa..3b5d1a4ce 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -17,6 +17,7 @@ from backend.lib.worker import BasicWorker from common.lib.dataset import DataSet, StatusType +from common.lib.compatibility import Compatibility from common.lib.fourcat_module import FourcatModule from common.lib.helpers import get_software_commit, remove_nuls, send_email, hash_to_md5 from common.lib.exceptions import (WorkerInterruptedException, ProcessorInterruptedException, ProcessorException, @@ -37,10 +38,18 @@ class BasicProcessor(FourcatModule, BasicWorker, metaclass=abc.ABCMeta): be used as input for another processor (though whether and when this is useful is another question). - To determine whether a processor can process a given dataset, you can - define a `is_compatible_with(FourcatModule module=None, config=None):) -> bool` class - method which takes a dataset as argument and returns a bool that determines - if this processor is considered compatible with that dataset. For example: + To determine whether a processor can process a given dataset, declare a + Compatibility specification as the `compatibility` class attribute. The + default `is_compatible_with` is evaluated from it. For example: + + .. code-block:: python + + compatibility = Compatibility(types={"linguistic-features"}) + + Processors with genuinely dynamic requirements (e.g. ones that must inspect + a dataset's genealogy) may instead override `is_compatible_with(cls, + module=None, config=None) -> bool` directly; an override takes precedence + over the `compatibility` attribute. For example: .. code-block:: python @@ -97,6 +106,11 @@ def is_compatible_with(cls, module=None, config=None): #: `remove_disposable_files()` method will be called. for_cleanup = None + #: A common.lib.compatibility.Compatibility object describing which datasets + #: this processor accepts. 
@classmethod
def is_compatible_with(cls, module=None, config=None):
    """
    Check whether this processor can run on the given module.

    The processor's `compatibility` class attribute is evaluated when it is
    set. Processors whose requirements cannot be declared that way (for
    example because they need to inspect a dataset's ancestry) can override
    this method; the override then replaces this default entirely.

    A processor that neither declares `compatibility` nor overrides this
    method is only compatible with top-level datasets (datasets without a
    parent), which is the historical default.

    :param module: Dataset (normally) or processor to check against
    :param ConfigManager|None config: Context-aware configuration reader
    :return bool: Whether the processor can run on `module`
    """
    spec = cls.compatibility
    if spec is None:
        # legacy default: run on collected data only, never on the output
        # of another processor
        spec = Compatibility(top_dataset_only=True)

    return spec.is_compatible_with(module, config=config)
- def exclude_followup_processors(cls, processor_type): - if processor_type in ["undesirable-followup-processor"]: - return True - return False + Follow-up processors that should never be offered after this one are + listed in the `excluded_followups` field of the `compatibility` + specification. Processors with dynamic exclusion logic may override this + method instead. - :param str processor_type: Processor type to exclude - :return bool: True if processor should be excluded, False otherwise + :param str processor_type: Processor type to check + :return bool: True if the follow-up should be excluded, False otherwise """ + if cls.compatibility is not None and processor_type in cls.compatibility.excluded_followups: + return True return False @abc.abstractmethod diff --git a/common/lib/compatibility.py b/common/lib/compatibility.py new file mode 100644 index 000000000..b4e74f0ea --- /dev/null +++ b/common/lib/compatibility.py @@ -0,0 +1,353 @@ +""" +Declarative processor compatibility. + +A Compatibility object describes the conditions under which a processor can run +on a dataset: + +* the data shape it consumes -- the dataset's type, file extension, media type, + datasource, and any columns it needs; +* the environment it needs -- external executables and 4CAT configuration + settings; +* the follow-up processors that are most relevant for its output, and any that + should never be offered. + +A processor declares one as its `compatibility` class attribute, for example:: + + compatibility = Compatibility( + media_types={"video"}, + type_prefixes={"video-downloader"}, + required_settings={("video-downloader.ffmpeg_path", is_executable)}, + ) + +BasicProcessor.is_compatible_with() evaluates it. A processor whose +requirements cannot be expressed this way -- for example one that must inspect +a dataset's ancestry -- may override is_compatible_with() instead; the override +is used in preference to the attribute. 
+ +`_maybe_call`: a utility function to safely read attributes or call methods on a + module, handling cases where the attribute or method might not exist or raise an exception. +Normally a `module` is a DataSet, but the values read here (its type, extension, media type +and so on) are also available on a processor class, so a processor can be checked even when +no dataset exists yet. +""" +from __future__ import annotations + +import shutil +from dataclasses import dataclass +from typing import Iterable, List, Optional + + +def _maybe_call(module, method, **kwargs): + """ + Read `module.method` without assuming it exists. + + Calls it and returns the result when it is a method, returns the value when + it is a plain attribute, and returns None when it is missing or raises. A + DataSet exposes these as methods; a processor class exposes some of them as + well, and this keeps the same check working for both. Any keyword arguments + are forwarded to the call (e.g. is_rankable(multiple_items=False)). + """ + attr = getattr(module, method, None) + if attr is None: + return None + if callable(attr): + try: + return attr(**kwargs) + except Exception: + return None + return attr + + +# TODO: memoize shutil.which() (used by is_executable / ExecutableSibling) -- its +# result is constant per process, so a cached wrapper (positive results only, to +# avoid stale negatives if an executable is installed without a restart) would +# avoid repeated $PATH scans on video-heavy pages. +def is_executable(path): + """ + Matcher for `required_settings`: the setting's value must point to an + executable found on the system (resolved with `shutil.which`). 
class ExecutableSibling:
    """
    Matcher for `required_settings`: the configured executable must resolve (via
    `shutil.which`) AND a sibling executable must exist in the same directory.
    The sibling is found by swapping `name` for `sibling` in the file name of
    the resolved executable. For tools that ship together, e.g. ffprobe
    alongside ffmpeg::

        required_settings={("video-downloader.ffmpeg_path",
                            ExecutableSibling("ffmpeg", "ffprobe"))}

    The matcher protocol is a one-argument callable, so arguments are passed via
    the constructor; `name`/`sibling` stay readable as attributes.

    :param str name: Name of the configured executable, e.g. "ffmpeg"
    :param str sibling: Name of the executable expected next to it
    """

    def __init__(self, name, sibling):
        self.name = name
        self.sibling = sibling

    def __call__(self, path):
        """
        Check a configured executable path.

        :param path: Configured value; None or empty fails safely
        :return bool: True only if both the executable and its sibling resolve
        """
        import os.path  # local import: only needed by this matcher

        resolved = shutil.which(path) if path else None
        if not resolved:
            return False

        directory, filename = os.path.split(resolved)
        if self.name not in filename:
            # nothing to swap, so the sibling's location is unknown; without
            # this guard the unchanged path would be looked up again and the
            # sibling would wrongly be reported as present
            return False

        # swap within the file name only, so a directory that happens to
        # contain the name (e.g. /opt/ffmpeg/bin) is left untouched
        sibling_filename = self.sibling.join(filename.rsplit(self.name, 1))
        return shutil.which(os.path.join(directory, sibling_filename)) is not None
+ datasources: Optional[Iterable[str]] = None + # Collector types (a dataset whose type ends in -search or -import) + # Compare with top_dataset_only, which reads key_parent; the two cover nearly + # the same datasets but differ in role (identity/OR vs gate/AND) + is_collector: bool = False + + # --- Shape gates (structural checks) --- + # Parent dataset types this processor CANNOT run on -- a hard gate + # (is_compatible_with returns False). Use when the processor would fail or + # produce garbage on that type (e.g. download_videos on telegram-search). + # For a soft filter use excluded_followups on the producer instead. + excluded_types: Iterable[str] = () + + # --- DataSet required gates --- + # The ground truth requires an existing DataSet to read its produced data + # TODO: processors could declare these attributes more explicitly + # When True, the processor only accepts a top-level dataset (one with no parent). + top_dataset_only: bool = False + # When True, the processor only accepts a non-top-level (child) dataset -- the + # inverse of top_dataset_only. + child_only: bool = False + # Result-file extensions the processor accepts, e.g. {"csv", "ndjson"}. + extensions: Optional[Iterable[str]] = None + + # --- Result file gates (reading the actual file is required, so these need a materialized + # dataset and cannot be resolved from a processor class -- see requires_dataset_result_file) --- + # TODO: processors again could point to these attributes more explicitly if known + # When set, is_rankable() must equal this (read from the result file). None = not checked. + rankable: Optional[bool] = None + # Forwarded to is_rankable(multiple_items=...) when `rankable` is set. False + # restricts to single-value rankings (rejecting multi-column word_1/word_2/... rankings). + rankable_multiple_items: bool = True + # Columns that must ALL be present in the dataset, read from its columns. + requires_all_columns: Iterable[str] = () + # Columns of which AT LEAST ONE must be present, read from its columns. 
+ requires_any_columns: Iterable[str] = () + + # --- Environment requirements --- + # Executables that must be found on the system path (checked with shutil.which). + required_packages: Iterable[str] = () + # Configuration the processor needs. Each entry is either a setting key, + # which must resolve to a truthy value, or a (key, expected) pair. The + # expected part may be a single value the setting must equal, a collection + # the setting's value must be in, or a function that receives the value and + # returns whether it is acceptable. + required_settings: Iterable = () + + # --- Follow-up processors --- + # Processor types to recommend first as next steps for this processor's output. + preferred_followups: Iterable[str] = () + # Processor types never SUGGESTED as follow-ups after this one -- a soft + # filter (affects the suggestion list only; is_compatible_with is unchanged, + # so they can still be run directly). Use for "a more specific processor is + # preferred here" (e.g. tiktok-search excludes the generic video-downloader). + # For "that processor would fail on this output", use its excluded_types (hard). + excluded_followups: Iterable[str] = () + + @property + def requires_dataset_result_file(self) -> bool: + """ + Whether fully evaluating this spec needs the dataset's produced data. + + True when `rankable`, `requires_all_columns`, or `requires_any_columns` is + set: all are read from the produced result file, so they cannot be + resolved from a processor class alone. (Shape axes such as + top_dataset/extension also read instance state, but they are recoverable + from a dataset's shape; only these need the produced data, so only they + are counted here.) A consumer that reasons about processors without real + datasets (e.g. a processor map) can use this to mark those axes as + undecided rather than treating them as failed. 
def unmet_requirements(self, module, config=None, first_only=True) -> List[str]:
    """
    Return the requirements `module` does not meet, as readable strings.

    An empty list means `module` is compatible. Each string names one thing
    that is missing -- a wrong dataset type, an absent column, a setting
    that is not configured, and so on.

    The checks run in three tiers, cheapest first, so that the short-circuit
    can skip later work once something fails:

    1. structural -- the dataset's shape (type, extension, parent,
       datasource); no result-file read. Several of these still read
       instance state, so on a bare processor class they yield a stub
       rather than a real answer.
    2. dataset-required -- `rankable`, `requires_all_columns` and
       `requires_any_columns`, which are read from the produced result file
       and therefore need a materialized DataSet.
    3. environment -- configuration settings and system executables.

    :param module: DataSet (normally) or processor class to check; None is
      reported as "no dataset provided"
    :param config: Configuration reader with a `get(key)` method, or None,
      in which case every setting reads as None
    :param bool first_only: Return as soon as one requirement is unmet
      (enough for a yes/no answer). Pass False to collect every unmet
      requirement, e.g. to explain why a module is not compatible.
    :return list: Unmet requirements; empty when `module` is compatible
    """
    if module is None:
        return ["no dataset provided"]

    reasons: List[str] = []
    module_type = getattr(module, "type", None)

    # --- tier 1: structural shape (cheap; no result-file read) ---

    # if the processor names the kinds of dataset it accepts, the module
    # must be one of them
    if self._identity_declared() and not self._identity_matches(module):
        reasons.append("dataset type/media is not accepted")
        if first_only:
            return reasons

    if self.excluded_types and module_type in set(self.excluded_types):
        reasons.append("does not run on dataset type: %s" % module_type)
        if first_only:
            return reasons

    if self.top_dataset_only and not _maybe_call(module, "is_top_dataset"):
        reasons.append("requires a top-level dataset")
        if first_only:
            return reasons

    if self.child_only and _maybe_call(module, "is_top_dataset"):
        reasons.append("requires a child (non-top-level) dataset")
        if first_only:
            return reasons

    if self.extensions is not None:
        # materialize once: works for one-shot iterables too, and sorting
        # keeps the message stable regardless of set iteration order
        accepted = set(self.extensions)
        if _maybe_call(module, "get_extension") not in accepted:
            reasons.append("requires extension: %s" % ", ".join(sorted(accepted)))
            if first_only:
                return reasons

    # --- tier 2: dataset-required (read from the result file; cannot be
    # resolved from a processor class) ---

    if self.rankable is not None:
        is_rankable = bool(_maybe_call(module, "is_rankable", multiple_items=self.rankable_multiple_items))
        if is_rankable != self.rankable:
            reasons.append(
                "requires a rankable dataset" if self.rankable
                else "requires a non-rankable dataset"
            )
            if first_only:
                return reasons

    all_columns = list(self.requires_all_columns)
    any_columns = list(self.requires_any_columns)
    if all_columns or any_columns:
        columns = _maybe_call(module, "get_columns") or []
        missing = [column for column in all_columns if column not in columns]
        if missing:
            reasons.append("requires all column(s): %s" % ", ".join(missing))
            if first_only:
                return reasons
        if any_columns and not any(column in columns for column in any_columns):
            reasons.append("requires any of column(s): %s" % ", ".join(any_columns))
            if first_only:
                return reasons

    # --- tier 3: environment (needs config/system, not a DataSet; the
    # executable matchers here can be expensive, so this tier runs last) ---
    for requirement in self.required_settings:
        key, expected = (requirement, None) if isinstance(requirement, str) else requirement
        value = config.get(key) if config is not None else None

        if expected is None:
            # no expected value; just check that the setting is truthy
            met = bool(value)
        elif callable(expected):
            # a function that validates the value (e.g. is_executable)
            met = bool(expected(value))
        elif isinstance(expected, (set, frozenset, list, tuple)):
            # a collection of acceptable values; an unhashable setting value
            # (e.g. a list) cannot be a member of a set and must count as
            # "not met" rather than crash the compatibility check
            try:
                met = value in expected
            except TypeError:
                met = False
        else:
            # a single expected value
            met = value == expected

        if not met:
            reasons.append("requires setting: %s" % key)
            if first_only:
                return reasons

    for package in self.required_packages:
        if not shutil.which(package):
            reasons.append("requires package: %s" % package)
            if first_only:
                return reasons

    return reasons
_maybe_call(module, "get_media_type") or getattr(module, "media_type", None) + if media in set(self.media_types): + return True + + if self.datasources is not None: + parameters = getattr(module, "parameters", None) or {} + if isinstance(parameters, dict) and parameters.get("datasource") in set(self.datasources): + return True + + if self.is_collector and _maybe_call(module, "is_from_collector"): + return True + + return False diff --git a/common/lib/dataset.py b/common/lib/dataset.py index e917a1aba..358d8a934 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -2027,11 +2027,9 @@ def get_compatible_processors(self, config=None): ): continue - # consider a processor compatible if its is_compatible_with - # method returns True *or* if it has no explicit compatibility - # check and this dataset is top-level (i.e. has no parent) - if (not hasattr(processor, "is_compatible_with") and not self.key_parent) \ - or (hasattr(processor, "is_compatible_with") and processor.is_compatible_with(self, config=config)): + # evaluates the processor's declarative `compatibility`; + # undeclared processors default to top-level-only + if processor.is_compatible_with(self, config=config): available[processor_type] = processor return available diff --git a/datasources/audio_to_text/audio_to_text.py b/datasources/audio_to_text/audio_to_text.py index 42ca241bd..ead409088 100644 --- a/datasources/audio_to_text/audio_to_text.py +++ b/datasources/audio_to_text/audio_to_text.py @@ -16,9 +16,8 @@ class AudioUploadToText(SearchMedia): title = "Convert speech to text" # title displayed in UI description = "Upload your own audio and use OpenAI's Whisper or GPT models to create transcripts" # description displayed in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - return AudioToText.is_compatible_with(module=module, config=config) + # reuse the AudioToText processor's compatibility -- this datasource runs it on uploaded audio + compatibility = 
AudioToText.compatibility @classmethod def get_options(cls, *args, **kwargs): diff --git a/processors/audio/audio_extractor.py b/processors/audio/audio_extractor.py index 91c68fc7b..637798836 100644 --- a/processors/audio/audio_extractor.py +++ b/processors/audio/audio_extractor.py @@ -10,6 +10,7 @@ import oslex from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility, is_executable from common.lib.exceptions import ProcessorInterruptedException __author__ = "Dale Wahl" @@ -33,18 +34,8 @@ class AudioExtractor(BasicProcessor): extension = "zip" # extension of result file, used internally and in UI media_type = "audio" - followups = ["audio-to-text"] - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on videos only - - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return (module.get_media_type() == "video" or module.type.startswith("video-downloader")) and \ - config.get("video-downloader.ffmpeg_path") and \ - shutil.which(config.get("video-downloader.ffmpeg_path")) + # Allow on video datasets when ffmpeg is available + compatibility = Compatibility(media_types={"video"}, type_prefixes={"video-downloader"}, required_settings={("video-downloader.ffmpeg_path", is_executable)}, preferred_followups=["audio-to-text"]) @classmethod def get_options(cls, parent_dataset=None, config=None): diff --git a/processors/conversion/clarifai_to_csv.py b/processors/conversion/clarifai_to_csv.py index 760b5c9c8..6aeb08168 100644 --- a/processors/conversion/clarifai_to_csv.py +++ b/processors/conversion/clarifai_to_csv.py @@ -4,6 +4,7 @@ import csv from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -27,15 +28,8 @@ class ConvertClarifaiOutputToCSV(BasicProcessor): description = "Convert the Clarifai API output to a simplified CSV file." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible - - :param module: Module determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "clarifai-api" + # Allow processor on Clarifai API output + compatibility = Compatibility(types={"clarifai-api"}) def process(self): """ diff --git a/processors/conversion/consolidate_urls.py b/processors/conversion/consolidate_urls.py index 800433647..e3de3d5b6 100644 --- a/processors/conversion/consolidate_urls.py +++ b/processors/conversion/consolidate_urls.py @@ -8,6 +8,7 @@ from processors.conversion.extract_urls import ExtractURLs from common.lib.exceptions import ProcessorInterruptedException from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput, split_urls __author__ = "Dale Wahl" @@ -28,6 +29,9 @@ class ConsolidateURLs(BasicProcessor): description = "Retain only the domain (and optionally path) of URLs; used for custom networks (e.g. 
author + domains)" extension = "csv" + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) + # Common domain prefaces to remove domain_prefaces = ["m", "www"] # Domain dictionary (after domain_prefaces are removed) with additional rules based on URL components to conform to "clean URLs" @@ -232,7 +236,7 @@ def get_options(cls, parent_dataset=None, config=None): "requires": "method==custom" }, } - + # Get the columns for the select columns option if parent_dataset: columns = parent_dataset.get_columns() # call once @@ -247,16 +251,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - This is meant to be inherited by other child classes - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_extension() in ["csv", "ndjson"] - def process(self): method = self.parameters.get("method", False) url_parsing_issues = [] diff --git a/processors/conversion/convert_text.py b/processors/conversion/convert_text.py index 5afd4396c..007c245e6 100644 --- a/processors/conversion/convert_text.py +++ b/processors/conversion/convert_text.py @@ -6,6 +6,7 @@ from common.lib.exceptions import ProcessorInterruptedException from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Sal Hagen" @@ -25,6 +26,9 @@ class ConvertText(BasicProcessor): "also be added to the original dataset as annotations.") # description displayed in UI extension = "csv" + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -83,16 +87,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: k: "text" in k).pop() return options - @classmethod - def 
is_compatible_with(cls, module=None, config=None): - """ - Allow processor on NDJSON and CSV files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_extension() in ("csv", "ndjson") - def process(self): """ Create a generator to iterate through items that can be passed to create either a csv or ndjson. diff --git a/processors/conversion/csv_to_json.py b/processors/conversion/csv_to_json.py index aef900379..418bfd05e 100644 --- a/processors/conversion/csv_to_json.py +++ b/processors/conversion/csv_to_json.py @@ -4,6 +4,7 @@ import json from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -20,16 +21,8 @@ class ConvertCSVToJSON(BasicProcessor): description = "Change a CSV file to a JSON file" # description displayed in UI extension = "json" # extension of result file, used internally and in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with a dataset or processor - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - - return module.get_extension() == "csv" + # Allow on CSV datasets + compatibility = Compatibility(extensions={"csv"}) def process(self): """ diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py index 83af5351c..2dc12308e 100644 --- a/processors/conversion/export_datasets.py +++ b/processors/conversion/export_datasets.py @@ -8,6 +8,7 @@ from backend.lib.processor import BasicProcessor from common.lib.dataset import DataSet from common.lib.exceptions import DataSetException +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -27,6 +28,10 @@ class ExportDatasets(BasicProcessor): "another 
4CAT instance. Filters are not included. Results expire after one day.") # description displayed in UI extension = "zip" # extension of result file, used internally and in UI + # coarse map spec; is_compatible_with (below) is the runtime truth -- it also checks + # the requesting user owns the dataset (is_accessible_by), which is per-user, not shape + compatibility = Compatibility(top_dataset_only=True) + @classmethod def is_compatible_with(cls, module=None, config=None): """ diff --git a/processors/conversion/extract_urls.py b/processors/conversion/extract_urls.py index b0ee80ba3..d95600874 100644 --- a/processors/conversion/extract_urls.py +++ b/processors/conversion/extract_urls.py @@ -12,6 +12,7 @@ from common.lib.exceptions import ProcessorInterruptedException from backend.lib.processor import BasicProcessor from common.lib.helpers import UserInput +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Stijn Peeters", "Dale Wahl", "Sal Hagen"] @@ -31,6 +32,9 @@ class ExtractURLs(BasicProcessor): description = "Extract any URLs from selected column(s) with the option to expand shortened URLs." 
extension = "csv" + # any csv/ndjson dataset, except this processor's own filter output + compatibility = Compatibility(extensions={"csv", "ndjson"}, excluded_types={"extract-urls-filter"}) + # taken from https://github.com/timleland/url-shorteners # current as of 9 April 2021 redirect_domains = ( @@ -153,15 +157,6 @@ class ExtractURLs(BasicProcessor): "api.parler.com", "trib.al", "fb.watch", ) - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - All processor on CSV and NDJSON datasets - - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_extension() in ["csv", "ndjson"] and module.type != "extract-urls-filter" - @classmethod def get_options(cls, parent_dataset=None, config=None): """ diff --git a/processors/conversion/hash_images.py b/processors/conversion/hash_images.py index 25a6eb4de..13b0414ca 100644 --- a/processors/conversion/hash_images.py +++ b/processors/conversion/hash_images.py @@ -9,6 +9,7 @@ from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import UserInput, hash_image, stringify_hash +from common.lib.compatibility import Compatibility from processors.metrics.group_hashes import HashGrouper @@ -28,6 +29,9 @@ class ImageHasher(BasicProcessor): description = "Convert images to text hashes for comparison and similarity detection." 
# description displayed in UI extension = "csv" + # image datasets: image archives, image-downloader output, or extracted video frames + compatibility = Compatibility(media_types={"image"}, type_prefixes={"image-downloader"}, types={"video-frames"}) + references = [ "[Imagehash library](https://github.com/JohannesBuchner/imagehash?tab=readme-ov-file)", "Explainer: [Perceptual hashing](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)", @@ -94,16 +98,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on image archives - - :param module: Module to determine compatibility with - """ - return module.get_media_type() == "image" or module.type.startswith( - "image-downloader") or module.type == "video-frames" - def process(self): """ Loop through images and hashing them diff --git a/processors/conversion/item_to_annotation.py b/processors/conversion/item_to_annotation.py index dfe17f9b5..bd91871a6 100644 --- a/processors/conversion/item_to_annotation.py +++ b/processors/conversion/item_to_annotation.py @@ -3,6 +3,7 @@ """ from common.lib.exceptions import ProcessorInterruptedException from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Sal Hagen" @@ -23,6 +24,9 @@ class ItemToAnnotation(BasicProcessor): "Explorer. 
Item values must be numbers or strings.") # description displayed in UI extension = "csv" + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -43,16 +47,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on NDJSON and CSV files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def process(self): """ :return generator: diff --git a/processors/conversion/merge_datasets.py b/processors/conversion/merge_datasets.py index 9bec40a63..a2022b57e 100644 --- a/processors/conversion/merge_datasets.py +++ b/processors/conversion/merge_datasets.py @@ -9,6 +9,7 @@ from common.lib.exceptions import ProcessorInterruptedException, DataSetException from common.lib.helpers import UserInput from common.lib.item_mapping import MappedItem +from common.lib.compatibility import Compatibility import ural __author__ = "Stijn Peeters" @@ -29,15 +30,8 @@ class DatasetMerger(BasicProcessor): description = "Merge this dataset with other datasets of the same format. A new dataset is " \ "created containing a combination of items from the original datasets." 
# description displayed in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on any top-level CSV or NDJSON file - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_extension() in ("csv", "ndjson") and (module.is_from_collector()) + # a collector's csv or ndjson output + compatibility = Compatibility(is_collector=True, extensions={"csv", "ndjson"}) @staticmethod def get_dataset_from_url(url, db, modules=None): diff --git a/processors/conversion/ndjson_to_csv.py b/processors/conversion/ndjson_to_csv.py index 75b012f6e..0782e0954 100644 --- a/processors/conversion/ndjson_to_csv.py +++ b/processors/conversion/ndjson_to_csv.py @@ -6,6 +6,7 @@ from common.lib.helpers import flatten_dict from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException __author__ = "Dale Wahl" @@ -26,15 +27,8 @@ class ConvertNDJSONtoCSV(BasicProcessor): "contain nested data." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_extension() == "ndjson" + # Allow on NDJSON datasets + compatibility = Compatibility(extensions={"ndjson"}) def process(self): """ diff --git a/processors/conversion/remove_author_info.py b/processors/conversion/remove_author_info.py index f7df341cb..ad1e8daf1 100644 --- a/processors/conversion/remove_author_info.py +++ b/processors/conversion/remove_author_info.py @@ -10,6 +10,7 @@ import csv from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import dict_search_and_update, UserInput, HashCache __author__ = "Stijn Peeters" @@ -30,22 +31,15 @@ class AuthorInfoRemover(BasicProcessor): title = "Pseudonymise or anonymise" # title displayed in UI description = "Removes or replaces data from the dataset in fields identified as containing personal information" + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + references = [ "[What is a hash?](https://techterms.com/definition/hash)", "[What is a salt?](https://en.wikipedia.org/wiki/Salt_(cryptography))", "[What is Blake2?](https://en.wikipedia.org/wiki/BLAKE_(hash_function)#BLAKE2)" ] - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on CSV files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ["csv", 'ndjson'] - @classmethod def get_options(cls, parent_dataset=None, config=None): options = { 
diff --git a/processors/conversion/split_by_thread.py b/processors/conversion/split_by_thread.py index 87027ff64..a53f8e55e 100644 --- a/processors/conversion/split_by_thread.py +++ b/processors/conversion/split_by_thread.py @@ -4,6 +4,7 @@ import csv from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -25,16 +26,9 @@ class ThreadSplitter(BasicProcessor): description = "Split the dataset per thread. The result is a ZIP archive containing separate CSV files." # description displayed in UI extension = "zip" # extension of result file, used internally and in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset + # datasets with a thread structure (4chan/8chan, reddit, breitbart) + compatibility = Compatibility(datasources={"fourchan", "eightchan", "reddit", "breitbart"}) - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.parameters.get("datasource") in ("4chan", "8chan", "reddit", "breitbart") - def process(self): """ This takes a 4CAT results file as input, and outputs a new CSV file diff --git a/processors/conversion/stringify.py b/processors/conversion/stringify.py index 3bf1c89b1..95ed4b2d2 100644 --- a/processors/conversion/stringify.py +++ b/processors/conversion/stringify.py @@ -5,6 +5,7 @@ import string from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Sal Hagen" @@ -22,11 +23,14 @@ class Stringify(BasicProcessor): description = "Merges the data from the body column into a single text file. The result can be used for word clouds, word trees, etc." 
# description displayed in UI extension = "txt" # extension of result file, used internally and in UI + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ Get processor options - + :param parent_dataset DataSet: An object representing the dataset that the processor would be or was run on. Can be used, in conjunction with config, to show some options only to privileged users. @@ -57,17 +61,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility; this processor is only compatible with top datasets in CSV or NDJSON format. - - :param str module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def process(self): """ This takes a 4CAT results file as input, and outputs a plain text file @@ -86,7 +79,7 @@ def process(self): regex += "0-9" if strip_punctuation: regex += string.punctuation - + delete_regex = re.compile("[\n\t" + regex + "]") posts = 0 diff --git a/processors/conversion/tcat_auto_upload.py b/processors/conversion/tcat_auto_upload.py index 2a72d610e..afdc2d521 100644 --- a/processors/conversion/tcat_auto_upload.py +++ b/processors/conversion/tcat_auto_upload.py @@ -9,6 +9,7 @@ from backend.lib.processor import BasicProcessor from common.lib.user_input import UserInput from common.lib.helpers import get_last_line +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl", "Stijn Peeters"] @@ -27,6 +28,9 @@ class FourcatToDmiTcatUploader(BasicProcessor): description = "Send a TCAT-ready JSON file to a particular DMI-TCAT server." 
# description displayed in UI extension = "html" # extension of result file, used internally and in UI + # the TCAT converter's output, when a TCAT server is configured + compatibility = Compatibility(types={"convert-ndjson-for-tcat"}, required_settings={"tcat-auto-upload.server_url", "tcat-auto-upload.token", "tcat-auto-upload.username", "tcat-auto-upload.password"}) + config = { # TCAT Server Connection Info 'tcat-auto-upload.server_url': { @@ -59,23 +63,6 @@ class FourcatToDmiTcatUploader(BasicProcessor): }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - It is if TCAT credentials have been configured and the input is a - TCAT-compatible file. - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "convert-ndjson-for-tcat" and \ - config.get('tcat-auto-upload.server_url') and \ - config.get('tcat-auto-upload.token') and \ - config.get('tcat-auto-upload.username') and \ - config.get('tcat-auto-upload.password') - @classmethod def get_options(cls, parent_dataset=None, config=None): """ diff --git a/processors/conversion/twitter_ndjson_to_tcat_json.py b/processors/conversion/twitter_ndjson_to_tcat_json.py index aecf9e2a3..44f4f0b04 100644 --- a/processors/conversion/twitter_ndjson_to_tcat_json.py +++ b/processors/conversion/twitter_ndjson_to_tcat_json.py @@ -4,6 +4,7 @@ import json from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -20,16 +21,8 @@ class ConvertNDJSONToJSON(BasicProcessor): description = "Convert a Twitter dataset to a TCAT-compatible format. This file can then be uploaded to TCAT." 
# description displayed in UI extension = "json" # extension of result file, used internally and in UI - followups = ["tcat-auto-upload"] - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Module to determine compatibility with - """ - return module.type == "twitterv2-search" + # Allow processor on Twitter/X (API v2) datasets + compatibility = Compatibility(types={"twitterv2-search"}, preferred_followups=["tcat-auto-upload"]) def process(self): """ @@ -139,7 +132,6 @@ def map_to_TCAT(self, tweet): 'geo_full' : tweet.get('geo'), } - # Retweet - TCAT checks existance of 'retweeted_status' as key to determine if tweet is a retweet # We instead search for a referenced_tweets with type 'retweeted' # This assumes only one retweet in reference tweets (which has proven true in testing) diff --git a/processors/conversion/upload_annotations.py b/processors/conversion/upload_annotations.py index 942bce24d..958fbfa15 100644 --- a/processors/conversion/upload_annotations.py +++ b/processors/conversion/upload_annotations.py @@ -7,6 +7,7 @@ from flask import g from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException, QueryParametersException, DataSetException from common.lib.helpers import UserInput from common.lib.dataset import DataSet @@ -29,6 +30,9 @@ class UploadAnnotations(BasicProcessor): "For CSV file uploads, comma is used as the separator. 
For text input, a custom separator can be specified.") extension = "csv" + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -78,16 +82,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on top-level CSV and NDJSON datasets - - :param module: Module to determine compatibility with - :param config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - @staticmethod def validate_query(query, request, config): """ diff --git a/processors/conversion/view_metadata.py b/processors/conversion/view_metadata.py index 3a2436f3b..8d09afb44 100644 --- a/processors/conversion/view_metadata.py +++ b/processors/conversion/view_metadata.py @@ -7,6 +7,7 @@ import zipfile from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.user_input import UserInput __author__ = "Dale Wahl" @@ -27,6 +28,9 @@ class ViewMetadata(BasicProcessor): description = "Reformats the .metadata.json file and calculates analytics" # description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow on downloaded media datasets + compatibility = Compatibility(type_prefixes={"video-downloader", "image-downloader"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -47,16 +51,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return 
module.type.startswith("video-downloader") or module.type.startswith("image-downloader") - def process(self): """ Grabs .metadata.json and reformats diff --git a/processors/conversion/vision_api_to_csv.py b/processors/conversion/vision_api_to_csv.py index d2d733e04..578563156 100644 --- a/processors/conversion/vision_api_to_csv.py +++ b/processors/conversion/vision_api_to_csv.py @@ -4,6 +4,7 @@ import csv from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Stijn Peeters" @@ -29,6 +30,9 @@ class ConvertVisionOutputToCSV(BasicProcessor): "to the original dataset.") # description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Google Vision API output + compatibility = Compatibility(types={"google-vision-api"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -49,16 +53,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "google-vision-api" - def process(self): """ This takes the NDJSON file as input and writes the same data as a CSV file diff --git a/processors/filtering/accent_fold.py b/processors/filtering/accent_fold.py index ee8d1f380..aed4de800 100644 --- a/processors/filtering/accent_fold.py +++ b/processors/filtering/accent_fold.py @@ -6,6 +6,7 @@ from unidecode import unidecode from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Stijn Peeters" @@ -27,16 +28,8 @@ class AccentFoldingFilter(BasicProcessor): "'á' to 'a', 'ç' to 'c', etc. 
This creates a new dataset.") extension = "csv" # extension of result file, used internally and in UI - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on iterable files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ["csv"] + # Allow on top-level CSV datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv"}) def process(self): """ @@ -105,7 +98,7 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: :param config ConfigManager|None config: Configuration reader (context-aware) :return dict: Options for this processor """ - + options = { "mode": { "help": "What to replace?", diff --git a/processors/filtering/base_filter.py b/processors/filtering/base_filter.py index 080686129..30168ccdb 100644 --- a/processors/filtering/base_filter.py +++ b/processors/filtering/base_filter.py @@ -6,6 +6,7 @@ import json from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -26,15 +27,8 @@ class BaseFilter(BasicProcessor): item_ids = [] - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - This is meant to be inherited by other child classes - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return False + # Abstract base filter; not runnable on its own (empty type set never matches) + compatibility = Compatibility(types=set()) def process(self): """ diff --git a/processors/filtering/column_filter.py b/processors/filtering/column_filter.py index 8b0729d9c..9e1fb83c3 100644 --- a/processors/filtering/column_filter.py +++ b/processors/filtering/column_filter.py @@ -7,6 +7,7 @@ from backend.lib.processor import BasicProcessor from 
processors.filtering.base_filter import BaseFilter from common.lib.helpers import UserInput, convert_to_int +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters", "Dale Wahl"] @@ -24,16 +25,8 @@ class ColumnFilter(BaseFilter): description = ("A flexible and customizable filter that lets you retain items in selected column that match a " "custom requirement. This creates a new dataset.") - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on top datasets that are CSV or NDJSON. - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") + # top-level csv/ndjson datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -264,15 +257,8 @@ class ColumnProcessorFilter(ColumnFilter): title = "Filter by value" # title displayed in UI description = "A generic filter that checks whether a value in a selected column matches a custom requirement. 
" - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on child datasets and do not create a standalone dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager config: Configuration reader (context-aware) - """ - return not module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") + # child (non-top-level) csv/ndjson datasets + compatibility = Compatibility(child_only=True, extensions={"csv", "ndjson"}) @classmethod def is_filter(cls): diff --git a/processors/filtering/date_filter.py b/processors/filtering/date_filter.py index ce8046066..b82e31ff6 100644 --- a/processors/filtering/date_filter.py +++ b/processors/filtering/date_filter.py @@ -6,6 +6,7 @@ from datetime import datetime from processors.filtering.base_filter import BaseFilter +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput from common.lib.exceptions import QueryParametersException @@ -23,6 +24,9 @@ class DateFilter(BaseFilter): category = "Filtering" # category title = "Filter by date" # title displayed in UI description = "Retains posts between given dates. This creates a new dataset." 
+ + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -56,16 +60,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on NDJSON and CSV files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def filter_items(self): """ Create a generator to iterate through items that can be passed to create either a csv or ndjson diff --git a/processors/filtering/lexical_filter.py b/processors/filtering/lexical_filter.py index 4fe924220..4c0129f65 100644 --- a/processors/filtering/lexical_filter.py +++ b/processors/filtering/lexical_filter.py @@ -4,6 +4,7 @@ import re from processors.filtering.base_filter import BaseFilter +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Stijn Peeters" @@ -22,6 +23,9 @@ class LexicalFilter(BaseFilter): description = "Retains posts that contain selected words or phrases, including preset word lists. " \ "This creates a new dataset." 
# description displayed in UI + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + references = [ "[Regex101](https://regex101.com/)" ] @@ -62,16 +66,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on NDJSON and CSV files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def filter_items(self): """ Create a generator to iterate through items that can be passed to create either a csv or ndjson. Use diff --git a/processors/filtering/random_filter.py b/processors/filtering/random_filter.py index 48e5e66cc..5b8763fb9 100644 --- a/processors/filtering/random_filter.py +++ b/processors/filtering/random_filter.py @@ -4,6 +4,7 @@ import random from processors.filtering.base_filter import BaseFilter +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput from common.lib.exceptions import QueryParametersException @@ -22,6 +23,9 @@ class RandomFilter(BaseFilter): title = "Random sample" # title displayed in UI description = "Retain a pseudorandom set of posts. This creates a new dataset." 
# description displayed in UI + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -41,16 +45,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on NDJSON and CSV files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def filter_items(self): """ Create a generator to iterate through items that can be passed to create either a csv or ndjson. Use diff --git a/processors/filtering/tiktok_refresh.py b/processors/filtering/tiktok_refresh.py index 5de1fe473..a4cb52458 100644 --- a/processors/filtering/tiktok_refresh.py +++ b/processors/filtering/tiktok_refresh.py @@ -6,6 +6,7 @@ from datasources.tiktok_urls.search_tiktok_urls import TikTokScraper from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" @@ -21,15 +22,8 @@ class UpdateTikTok(BasicProcessor): description = "Re-query TikTok URLs to update the dataset, e.g. to refresh video URLs or like counts." 
extension = "ndjson" - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on NDJSON and CSV files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["tiktok-search", "tiktok-urls-search"] + # Allow processor on TikTok datasets + compatibility = Compatibility(types={"tiktok-search", "tiktok-urls-search"}) def process(self): """ diff --git a/processors/filtering/unique_filter.py b/processors/filtering/unique_filter.py index b87bdce34..8a82dfc47 100644 --- a/processors/filtering/unique_filter.py +++ b/processors/filtering/unique_filter.py @@ -4,6 +4,7 @@ import json from processors.filtering.base_filter import BaseFilter +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Sal Hagen" @@ -21,15 +22,8 @@ class UniqueFilter(BaseFilter): title = "Filter for unique items" # title displayed in UI description = "Only keeps the first encounter of an item. This creates a new dataset." 
# description displayed in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on NDJSON and CSV files - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) def filter_items(self): """ diff --git a/processors/filtering/unique_images.py b/processors/filtering/unique_images.py index 482ad3ceb..8f5b448eb 100644 --- a/processors/filtering/unique_images.py +++ b/processors/filtering/unique_images.py @@ -7,6 +7,7 @@ from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import UserInput, hash_file +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -24,6 +25,9 @@ class UniqueImageFilter(BasicProcessor): description = "Only keeps one instance per image using various detection methods." 
# description displayed in UI extension = "zip" + # image datasets: image archives, image-downloader output, or extracted video frames + compatibility = Compatibility(media_types={"image"}, type_prefixes={"image-downloader"}, types={"video-frames"}) + references = [ "[Imagehash library](https://github.com/JohannesBuchner/imagehash?tab=readme-ov-file)", "Explainer: [Average hash](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)", @@ -58,16 +62,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on image archives - - :param module: Module to determine compatibility with - """ - return module.get_media_type() == "image" or module.type.startswith( - "image-downloader") or module.type == "video-frames" - def process(self): """ Loop through images and only retain ones that have not been seen yet diff --git a/processors/machine_learning/audio_to_text.py b/processors/machine_learning/audio_to_text.py index 62a5e9458..ed4078ba0 100644 --- a/processors/machine_learning/audio_to_text.py +++ b/processors/machine_learning/audio_to_text.py @@ -7,6 +7,7 @@ from requests.exceptions import ConnectionError from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.dmi_service_manager import DmiServiceManager, DmiServiceManagerException, DsmOutOfMemory from common.lib.exceptions import ProcessorInterruptedException from common.lib.user_input import UserInput @@ -29,7 +30,8 @@ class AudioToText(BasicProcessor): " GPT models (GPT only via API).") # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - followups = [] + # Allow on audio datasets + compatibility = Compatibility(media_types={"audio"}, type_prefixes={"audio-extractor"}) references = [ "[OpenAI Whisper blog](https://openai.com/research/whisper)", @@ -90,13 +92,6 @@ def 
get_queue_id(cls, remote_id, details, dataset) -> str: # Queue per model/API type return f"{cls.type}-{dataset.parameters.get('model_host', 'local')}" - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on audio archives - """ - return module.get_media_type() == 'audio' or module.type.startswith("audio-extractor") - @classmethod def get_options(cls, parent_dataset=None, config=None): """ diff --git a/processors/machine_learning/blip2_image_caption.py b/processors/machine_learning/blip2_image_caption.py index c515085d9..109d811d2 100644 --- a/processors/machine_learning/blip2_image_caption.py +++ b/processors/machine_learning/blip2_image_caption.py @@ -9,6 +9,7 @@ from common.lib.exceptions import ProcessorInterruptedException from common.lib.user_input import UserInput from common.lib.item_mapping import MappedItem +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -26,8 +27,8 @@ class CategorizeImagesCLIP(BasicProcessor): description = "The BLIP2 model uses a pretrained image encoder combined with an LLM to generate image captions. The model can also be prompted and uses the image plus prompt to generate text responses." 
# description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - # Processors designed to handle input from this Dataset - followups = ["image-text-wall"] + # image datasets (image archives or image-downloader output), when BLIP2 is enabled + compatibility = Compatibility(media_types={"image"}, type_prefixes={"image-downloader"}, required_settings={"dmi-service-manager.fc_blip2_enabled", "dmi-service-manager.ab_server_address"}, preferred_followups=["image-text-wall"]) references = [ "[OpenAI CLIP blog](https://openai.com/research/clip)", @@ -67,15 +68,6 @@ def get_queue_id(cls, remote_id, details, dataset) -> str: # Unique queue for locally hosted models; used by other local model processors as well return "local_models" - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on image archives if enabled in Control Panel - """ - return config.get("dmi-service-manager.fc_blip2_enabled", False) and \ - config.get("dmi-service-manager.ab_server_address", False) and \ - (module.get_media_type() == "image" or module.type.startswith("image-downloader")) - @classmethod def get_options(cls, parent_dataset=None, config=None): """ diff --git a/processors/machine_learning/clarifai_api.py b/processors/machine_learning/clarifai_api.py index aad832e91..1b6dfaab1 100644 --- a/processors/machine_learning/clarifai_api.py +++ b/processors/machine_learning/clarifai_api.py @@ -10,6 +10,7 @@ from common.lib.helpers import UserInput, convert_to_int from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -33,7 +34,8 @@ class ClarifaiAPIFetcher(BasicProcessor): "requests will be credited by Clarifai to the owner of the API token you provide." 
# description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - followups = ["convert-clarifai-vision-to-csv", "clarifai-bipartite-network"] + # Allow on image sets + compatibility = Compatibility(media_types={"image"}, type_prefixes={"image-downloader"}, types={"video-frames"}, preferred_followups=["convert-clarifai-vision-to-csv", "clarifai-bipartite-network"]) references = [ "[Clarifai](https://www.clarifai.com/)", @@ -41,16 +43,6 @@ class ClarifaiAPIFetcher(BasicProcessor): "[Clarifai model browser](https://clarifai.com/clarifai/main/models)" ] - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on image sets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_media_type() == "image" or module.type.startswith("image-downloader") or module.type == "video-frames" - @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ diff --git a/processors/machine_learning/clip_categorize_images.py b/processors/machine_learning/clip_categorize_images.py index 30d29e463..99aa084d1 100644 --- a/processors/machine_learning/clip_categorize_images.py +++ b/processors/machine_learning/clip_categorize_images.py @@ -10,6 +10,7 @@ from common.lib.exceptions import ProcessorInterruptedException from common.lib.user_input import UserInput from common.lib.item_mapping import MappedItem +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -28,7 +29,8 @@ class CategorizeImagesCLIP(BasicProcessor): "the likelihood an image belongs to a category (total of all category values will be 100%).") # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - followups = ["image-category-wall"] + # image datasets (image archives or image-downloader output), when CLIP is enabled + 
compatibility = Compatibility(media_types={"image"}, type_prefixes={"image-downloader"}, required_settings={"dmi-service-manager.cc_clip_enabled", "dmi-service-manager.ab_server_address"}, preferred_followups=["image-category-wall"]) references = [ "[OpenAI CLIP blog](https://openai.com/research/clip)", @@ -69,15 +71,6 @@ def get_queue_id(cls, remote_id, details, dataset) -> str: # Unique queue for locally hosted models; used by other local model processors as well return "local_models" - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on image archives if enabled in Control Panel - """ - return config.get("dmi-service-manager.cc_clip_enabled", False) and \ - config.get("dmi-service-manager.ab_server_address", False) and \ - (module.get_media_type() == "image" or module.type.startswith("image-downloader")) - @classmethod def get_options(cls, parent_dataset=None, config=None): """ diff --git a/processors/machine_learning/generate_images.py b/processors/machine_learning/generate_images.py index 7ca0d219e..273d0835c 100644 --- a/processors/machine_learning/generate_images.py +++ b/processors/machine_learning/generate_images.py @@ -11,6 +11,7 @@ from processors.visualisation.download_images import ImageDownloader from common.lib.dmi_service_manager import DmiServiceManager, DmiServiceManagerException, DsmOutOfMemory from common.lib.user_input import UserInput +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -28,7 +29,9 @@ class StableDiffusionImageGenerator(BasicProcessor): description = "Given a list of prompts, generates images using the Stable Diffusion XL image model." 
# description displayed in UI extension = "zip" # extension of result file, used internally and in UI - followups = ImageDownloader.followups + # coarse map spec; is_compatible_with (below) is the runtime truth -- it also requires the + # dataset to have columns (a prompt source), which can't be declared statically + compatibility = Compatibility(required_settings={"dmi-service-manager.sd_enabled", "dmi-service-manager.ab_server_address"}, preferred_followups=ImageDownloader.followups) references = [ "[Stable Diffusion XL 1.0 model card](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)" diff --git a/processors/machine_learning/google_vision_api.py b/processors/machine_learning/google_vision_api.py index 2491967e1..73cae0d2a 100644 --- a/processors/machine_learning/google_vision_api.py +++ b/processors/machine_learning/google_vision_api.py @@ -10,6 +10,7 @@ from common.lib.helpers import UserInput, convert_to_int from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException __author__ = "Stijn Peeters" @@ -35,23 +36,14 @@ class GoogleVisionAPIFetcher(BasicProcessor): "and Google Vision API enabled (this may take a few minutes)." 
# description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - followups = ["convert-google-vision-to-csv", "vision-bipartite-network", "vision-label-network"] + # Allow on image sets + compatibility = Compatibility(media_types={"image"}, type_prefixes={"image-downloader"}, types={"video-frames"}, preferred_followups=["convert-google-vision-to-csv", "vision-bipartite-network", "vision-label-network"]) references = [ "[Google Vision API Documentation](https://cloud.google.com/vision/docs)", "[Google Vision API Pricing & Free Usage Limits](https://cloud.google.com/vision/pricing)" ] - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on image sets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_media_type() == "image" or module.type.startswith("image-downloader") or module.type == "video-frames" - @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ diff --git a/processors/machine_learning/llm_prompter.py b/processors/machine_learning/llm_prompter.py index c2bd0d02e..5b823daa4 100644 --- a/processors/machine_learning/llm_prompter.py +++ b/processors/machine_learning/llm_prompter.py @@ -18,6 +18,7 @@ from common.lib.helpers import UserInput, nthify, andify, remove_nuls, flatten_dict from common.lib.llm import LLMAdapter from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility class LLMPrompter(BasicProcessor): """ @@ -30,6 +31,10 @@ class LLMPrompter(BasicProcessor): "entity extraction, or OCR. Supported APIs include OpenAI, Google, Anthropic, Mistral, and DeepSeek.") extension = "ndjson" # extension of result file, used internally and in UI. In this case it's variable! 
+ # coarse map spec; is_compatible_with (below) is the runtime truth -- it accepts csv/ndjson + # tables, OR zip archives of image/video/audio media (_almost_ all zips, but not all) + compatibility = Compatibility(extensions={"csv", "ndjson", "zip"}) + references = [ "[Törnberg, Petter. 2023. 'How to Use LLMs for Text Analysis.' arXiv:2307.13106.](https://arxiv.org/pdf/2307." "13106)", diff --git a/processors/machine_learning/perspective.py b/processors/machine_learning/perspective.py index f8d1467fd..4b88035d3 100644 --- a/processors/machine_learning/perspective.py +++ b/processors/machine_learning/perspective.py @@ -9,6 +9,7 @@ from backend.lib.processor import BasicProcessor from googleapiclient import discovery from common.lib.item_mapping import MappedItem +from common.lib.compatibility import Compatibility class Perspective(BasicProcessor): """ @@ -21,6 +22,9 @@ class Perspective(BasicProcessor): "including 'toxicity', 'insult', and 'profanity'.") # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI + # top-level text datasets (scores text columns via the Perspective API) + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + references = [ "[Perspective API documentation](https://developers.perspectiveapi.com/s/about-the-api)", "[Rieder, Bernhard, and Yarden Skop. 2021.
'The fabrics of machine moderation: Studying the technical, " diff --git a/processors/machine_learning/pix-plot.py b/processors/machine_learning/pix-plot.py index 4668ff62f..7d0a6b991 100644 --- a/processors/machine_learning/pix-plot.py +++ b/processors/machine_learning/pix-plot.py @@ -13,6 +13,7 @@ from common.lib.dmi_service_manager import DmiServiceManager, DsmOutOfMemory, DmiServiceManagerException from common.lib.helpers import UserInput, ellipsiate from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -33,7 +34,8 @@ class PixPlotGenerator(BasicProcessor): "algorithmically grouped by similarity." extension = "html" # extension of result file, used internally and in UI - followups = [] + # image datasets (image archives or image-downloader output), when PixPlot is enabled + compatibility = Compatibility(media_types={"image"}, type_prefixes={"image-downloader"}, required_settings={"dmi-service-manager.db_pixplot_enabled", "dmi-service-manager.ab_server_address"}) references = [ "[PixPlot](https://pixplot.io/)", @@ -145,18 +147,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token sets; - Checks if pix-plot.server_url set - - :param module: Dataset or processor to determine compatibility with - """ - return config.get("dmi-service-manager.db_pixplot_enabled", False) and \ - config.get("dmi-service-manager.ab_server_address", False) and \ - (module.get_media_type() == "image" or module.type.startswith("image-downloader")) - def process(self): """ This takes a 4CAT results file as input, copies the images to a temp diff --git a/processors/machine_learning/prompt_compass.py b/processors/machine_learning/prompt_compass.py index 076bd916f..8fa6ef3f7 100644 --- a/processors/machine_learning/prompt_compass.py +++ 
b/processors/machine_learning/prompt_compass.py @@ -4,6 +4,7 @@ from backend.lib.preset import ProcessorPreset from common.lib.helpers import UserInput from common.lib.llm import LLMAdapter +from common.lib.compatibility import Compatibility from common.lib.exceptions import ( QueryParametersException, @@ -25,6 +26,10 @@ class PromptCompassRunner(ProcessorPreset): "original dataset as a new column.") extension = "ndjson" + # coarse map spec; is_compatible_with (below) is the runtime truth -- it also checks + # that LLM models are configured (get_available_models) + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + references = [ "This processor is an implementation of the stand-alone tool [PromptCompass](https://github.com/ErikBorra/PromptCompass) by Erik Borra.", "See the processor options for references to the sources of each prompt in the library." @@ -82,8 +87,8 @@ def get_available_models(config): return models - @staticmethod - def is_compatible_with(module=None, config=None): + @classmethod + def is_compatible_with(cls, module=None, config=None): """ Determine compatibility diff --git a/processors/machine_learning/text_from_image.py b/processors/machine_learning/text_from_image.py index 8dc7efe9a..0edce68a9 100644 --- a/processors/machine_learning/text_from_image.py +++ b/processors/machine_learning/text_from_image.py @@ -13,6 +13,7 @@ from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException from common.lib.item_mapping import MappedItem +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -37,8 +38,8 @@ class ImageTextDetector(BasicProcessor): """ extension = "ndjson" # extension of result file, used internally and in UI - # Processors designed to handle input from this Dataset - followups = ["image-text-wall"] + # image datasets (image archives or image-downloader output), when the OCR server is enabled + 
compatibility = Compatibility(media_types={"image"}, type_prefixes={"image-downloader"}, required_settings={"dmi-service-manager.eb_ocr_enabled", "dmi-service-manager.ab_server_address"}, preferred_followups=["image-text-wall"]) references = [ "[DMI OCR Server](https://github.com/digitalmethodsinitiative/ocr_server#readme)", @@ -102,17 +103,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: # }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on image sets - - :param module: Module to determine compatibility with - """ - return config.get('dmi-service-manager.eb_ocr_enabled', False) and \ - config.get("dmi-service-manager.ab_server_address", False) and \ - (module.get_media_type() == "image" or module.type.startswith("image-downloader")) - def process(self): """ This takes a 4CAT zip file of images, and outputs a NDJSON file with the diff --git a/processors/metrics/annotation_metadata.py b/processors/metrics/annotation_metadata.py index d57a41e48..21fb091ba 100644 --- a/processors/metrics/annotation_metadata.py +++ b/processors/metrics/annotation_metadata.py @@ -3,6 +3,7 @@ """ from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from datetime import datetime @@ -17,6 +18,10 @@ class AnnotationMetadata(BasicProcessor): "Includes annotation author, timestamp, type, etc.") # description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # coarse map spec (accepts any dataset); is_compatible_with (below) is the runtime + # truth -- it requires the dataset to actually have annotations (annotation_fields) + compatibility = Compatibility() + @classmethod def is_compatible_with(cls, module=None, config=None): """ diff --git a/processors/metrics/count_posts.py b/processors/metrics/count_posts.py index f95c94a54..e30cabe1c 100644 --- a/processors/metrics/count_posts.py +++ b/processors/metrics/count_posts.py @@ -4,6 +4,7 
@@ from common.lib.helpers import UserInput, pad_interval, get_interval_descriptor from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -21,18 +22,8 @@ class CountPosts(BasicProcessor): description = "Counts how many items are in the dataset per date (or overall)." # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["histogram"] - - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}, preferred_followups=["histogram"]) @classmethod def get_options(cls, parent_dataset=None, config=None): diff --git a/processors/metrics/debate_metrics.py b/processors/metrics/debate_metrics.py index 4e977fdd9..d784fb848 100644 --- a/processors/metrics/debate_metrics.py +++ b/processors/metrics/debate_metrics.py @@ -5,6 +5,7 @@ import time from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Sal Hagen" __credits__ = ["Sal Hagen"] @@ -30,17 +31,8 @@ class DebateMetrics(BasicProcessor): description = "Returns a csv with meta-metrics per thread." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor if dataset is a 'top level' dataset - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.parameters.get("datasource") in ("fourchan", "eightchan", "eightkun") + # chan datasets (thread-level debate metrics) + compatibility = Compatibility(datasources={"fourchan", "eightchan", "eightkun"}) def process(self): """ diff --git a/processors/metrics/group_hashes.py b/processors/metrics/group_hashes.py index 6545a8528..7712aa4fa 100644 --- a/processors/metrics/group_hashes.py +++ b/processors/metrics/group_hashes.py @@ -3,6 +3,7 @@ import imagehash from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import UserInput, normalize_crhash_components @@ -21,6 +22,9 @@ class HashGrouper(BasicProcessor): description = "Calculate groups of similar hashes from a CSV file." 
# description displayed in UI extension = "csv" + # Allow processor on image-hasher output (could also work on any CSV with the right fields) + compatibility = Compatibility(types={"image-hasher"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -44,17 +48,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on above hasher output - - Could also allow on any CSV with appropriate fields - - :param module: Module to determine compatibility with - """ - return module.type == "image-hasher" - @staticmethod def compute_groups(hashes, hash_type: str, hash_size: int | None, similarity_pct: float) -> list[int]: """ @@ -158,7 +151,7 @@ def process(self): for item in self.source_dataset.iterate_items(self): if self.interrupted: raise ProcessorInterruptedException("Interrupted while grouping hashes") - + row = dict(item) # Discover and enforce a single hash_type row_hash_type = row.get("hash_type") diff --git a/processors/metrics/most_quoted.py b/processors/metrics/most_quoted.py index c8a83982b..6bb49fe4b 100644 --- a/processors/metrics/most_quoted.py +++ b/processors/metrics/most_quoted.py @@ -5,6 +5,7 @@ import re from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -23,18 +24,8 @@ class QuoteRanker(BasicProcessor): description = "Sort posts by how often they were replied to by other posts in the dataset." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on chan datasets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.parameters.get("datasource") in ("fourchan", "eightchan", "eightkun") - + # chan datasets (posts reply to / quote each other) + compatibility = Compatibility(datasources={"fourchan", "eightchan", "eightkun"}) def process(self): """ diff --git a/processors/metrics/rank_attribute.py b/processors/metrics/rank_attribute.py index 2f50d623a..62fc7f2a6 100644 --- a/processors/metrics/rank_attribute.py +++ b/processors/metrics/rank_attribute.py @@ -8,6 +8,7 @@ from itertools import islice, chain from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput, convert_to_int, get_interval_descriptor __author__ = "Stijn Peeters" @@ -30,23 +31,13 @@ class AttributeRanker(BasicProcessor): description = "Count values in a dataset column, like URLs or hashtags (overall or per timeframe)" # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) references = ["[regex010](https://regex101.com/)"] include_missing_data = True - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on all csv and NDJSON datasets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - - return module.get_extension() in ("csv", "ndjson") - @classmethod def get_options(cls, parent_dataset=None, config=None): diff --git a/processors/metrics/thread_metadata.py 
b/processors/metrics/thread_metadata.py index d2f0f8747..86b487d5a 100644 --- a/processors/metrics/thread_metadata.py +++ b/processors/metrics/thread_metadata.py @@ -5,6 +5,7 @@ import math from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Sal Hagen" __credits__ = ["Sal Hagen"] @@ -25,18 +26,8 @@ class ThreadMetadata(BasicProcessor): ) # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] - - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) def process(self): """ diff --git a/processors/metrics/top_images.py b/processors/metrics/top_images.py index 7698e77ce..a6d5a8d87 100644 --- a/processors/metrics/top_images.py +++ b/processors/metrics/top_images.py @@ -6,6 +6,7 @@ from collections import Counter, OrderedDict from backend.lib.processor import BasicProcessor from common.lib.helpers import UserInput +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -25,18 +26,8 @@ class TopImageCounter(BasicProcessor): description = "Collect all image URLs and sort by most-occurring." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["image-downloader"] - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - All top-level datasets, excluding Telegram, which has a different image logic - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - - return module.is_top_dataset() and module.type != "telegram-search" and module.get_extension() in ("csv", "ndjson") + # top-level csv/ndjson datasets, except Telegram (which has its own image logic) + compatibility = Compatibility(top_dataset_only=True, excluded_types={"telegram-search"}, extensions={"csv", "ndjson"}, preferred_followups=["image-downloader"]) @classmethod def get_options(cls, parent_dataset=None, config=None): diff --git a/processors/metrics/url_titles.py b/processors/metrics/url_titles.py index 13c9c90a1..a50abec33 100644 --- a/processors/metrics/url_titles.py +++ b/processors/metrics/url_titles.py @@ -4,6 +4,7 @@ import csv from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from backend.lib.proxied_requests import FailedProxiedRequest from common.lib.helpers import UserInput from common.lib.exceptions import ProcessorInterruptedException @@ -31,7 +32,8 @@ class URLFetcher(BasicProcessor): "each URL, optionally following HTTP redirects.") # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) config = { "url-metadata.timeout": { @@ -43,17 +45,6 @@ class URLFetcher(BasicProcessor): } } - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None 
config: Configuration reader (context-aware) - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - @classmethod def get_options(cls, parent_dataset=None, config=None): """ diff --git a/processors/metrics/vocabulary_overtime.py b/processors/metrics/vocabulary_overtime.py index 8aa5e634a..eae9b2712 100644 --- a/processors/metrics/vocabulary_overtime.py +++ b/processors/metrics/vocabulary_overtime.py @@ -4,6 +4,7 @@ import re from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput, get_interval_descriptor __author__ = "Stijn Peeters" @@ -22,7 +23,8 @@ class OvertimeAnalysis(BasicProcessor): description = "Determines the counts over time of particular set of words or phrases." # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) references = [ "[\"Salvaging the Internet Hate Machine: Using the discourse of radical online subcultures to identify emergent extreme speech\" - Unblished paper detailing the OILab extreme speech lexigon](https://oilab.eu/texts/4CAT_Hate_Speech_WebSci_paper.pdf)", @@ -78,17 +80,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def process(self): """ Reads a CSV file, counts occurrences of chosen values over all posts, diff --git a/processors/metrics/youtube_metadata.py b/processors/metrics/youtube_metadata.py index e38b884b5..6dacf0fe3 100644 --- 
a/processors/metrics/youtube_metadata.py +++ b/processors/metrics/youtube_metadata.py @@ -11,6 +11,7 @@ from backend.lib.processor import BasicProcessor from common.lib.helpers import UserInput +from common.lib.compatibility import Compatibility __author__ = "Sal Hagen" __credits__ = ["Sal Hagen"] @@ -36,7 +37,8 @@ class YouTubeMetadata(BasicProcessor): "Uses the YouTube API.") # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["youtube-thumbnails"] + # collector output or extract-urls-filter output, as csv/ndjson (may contain youtube links) + compatibility = Compatibility(is_collector=True, types={"extract-urls-filter"}, extensions={"csv", "ndjson"}, preferred_followups=["youtube-thumbnails"]) max_retries = 3 sleep_time = 20 @@ -135,18 +137,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on datasets probably containing youtube links - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - # Compatible with every top-level dataset. - return ((module.is_top_dataset() and module.get_extension() in ("csv", "ndjson")) - or module.type == "extract-urls-filter") - def process(self): """ Writes a csv file with metadata of extracted YouTube objects. 
diff --git a/processors/networks/clarifai_bipartite_network.py b/processors/networks/clarifai_bipartite_network.py index d3d344b38..35bbdcd2f 100644 --- a/processors/networks/clarifai_bipartite_network.py +++ b/processors/networks/clarifai_bipartite_network.py @@ -2,6 +2,7 @@ Google Vision API co-label network """ from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Stijn Peeters" @@ -24,6 +25,9 @@ class VisionTagBiPartiteNetworker(BasicProcessor): "labels if the label occurs for the image with that file name." extension = "gexf" # extension of result file, used internally and in UI + # Allow processor to run on Clarifai API data + compatibility = Compatibility(types={"clarifai-api"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -46,16 +50,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on Google Vision API data - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "clarifai-api" - def process(self): """ Generates a GEXF file-annotation graph. diff --git a/processors/networks/colink_urls.py b/processors/networks/colink_urls.py index 92335c2d4..ff192d150 100644 --- a/processors/networks/colink_urls.py +++ b/processors/networks/colink_urls.py @@ -8,6 +8,7 @@ import psutil from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import UserInput @@ -33,6 +34,9 @@ class URLCoLinker(BasicProcessor): "Edges are weighted by amount of co-links." 
# description displayed in UI extension = "gexf" # extension of result file, used internally and in UI + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -67,16 +71,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on top datasets. - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def process(self): """ This takes a 4CAT results file as input, and outputs a new CSV file @@ -87,7 +81,7 @@ def process(self): if self.parameters.get("level") == "thread" and "thread_id" not in self.source_dataset.get_columns(): self.dataset.finish_with_error("Thread-level co-linking requires a 'thread_id' column in the dataset.") return - + # we use these to extract URLs and host names if needed link_regex = re.compile(r"https?://[^\s\]()]+") www_regex = re.compile(r"^www\.") @@ -177,7 +171,7 @@ def process(self): self.dataset.update_status(f"Network has {len(network.nodes)} and {len(network.edges)} edges") self.dataset.log(f"time elapsed: {time.time() - start_time:.2f} seconds") self.dataset.update_status("Writing network file") - + writer = multiprocessing.Process(target=_write_gexf, args=(network, self.dataset.get_results_path())) writer.start() while writer.is_alive(): @@ -195,7 +189,7 @@ def process(self): self.dataset.finish_with_error("Network write failed") self.log.warning(f"Network writer exited with code {writer.exitcode} for dataset {self.dataset.key}") return - + self.dataset.log(f"time to complete: {time.time() - start_time:.2f} seconds") self.dataset.finish(len(network.nodes)) diff --git a/processors/networks/cotag_network.py 
b/processors/networks/cotag_network.py index 2464e3dd8..6e4ec3816 100644 --- a/processors/networks/cotag_network.py +++ b/processors/networks/cotag_network.py @@ -4,6 +4,7 @@ from backend.lib.preset import ProcessorPreset from common.lib.helpers import UserInput +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -26,6 +27,9 @@ class CoTaggerPreset(ProcessorPreset): possible_tag_columns = {"tags", "hashtags", "groups"} + # datasets with at least one tag-like column + compatibility = Compatibility(requires_any_columns=possible_tag_columns) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -53,17 +57,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on datasets containing a tags column - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - columns = module.get_columns() - return bool(set(columns) & cls.possible_tag_columns) if columns else False - def get_processor_pipeline(self): """ Generate co-tag graph of items diff --git a/processors/networks/coword_network.py b/processors/networks/coword_network.py index ee5c8dbc8..14be06c44 100644 --- a/processors/networks/coword_network.py +++ b/processors/networks/coword_network.py @@ -3,6 +3,7 @@ """ from backend.lib.preset import ProcessorPreset +from common.lib.compatibility import Compatibility __author__ = "Sal Hagen" __credits__ = ["Sal Hagen"] @@ -22,15 +23,8 @@ class CowordNetworker(ProcessorPreset): "amount of co-word occurrences." 
# description displayed in UI extension = "gexf" # extension of result file, used internally and in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on collocations - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "collocations" + # Allow processor to run on collocations + compatibility = Compatibility(types={"collocations"}) def get_processor_pipeline(self): """ diff --git a/processors/networks/gexf_to_csv.py b/processors/networks/gexf_to_csv.py index 609bb93ae..729ca7b8b 100644 --- a/processors/networks/gexf_to_csv.py +++ b/processors/networks/gexf_to_csv.py @@ -2,6 +2,7 @@ Convert a GEXF network file to a CSV file """ from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility import networkx as nx import csv @@ -23,14 +24,8 @@ class GexfToCsv(BasicProcessor): description = "Convert a GEXF network file to a CSV spreadsheet" extension = "csv" - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on all csv and NDJSON datasets - - :param module: Module to determine compatibility with - """ - return module.get_extension() in ["gexf"] + # Allow on GEXF datasets + compatibility = Compatibility(extensions={"gexf"}) def process(self): """ @@ -58,7 +53,7 @@ def process(self): result.update({"target": target}) result.update({f"target_{k}": v for k,v in target_attributes.items()}) result.update({f"edge_{attr_key}": edge_attributes[attr_key] for attr_key in sorted(edge_attributes, key=lambda k: k == "id", reverse=True)}) - + if writer is False: # Write header # Notes: this assumes that all nodes have the same attributes which ought to be True for GEXF files written by 4CAT diff --git a/processors/networks/google_vision_bipartite_network.py b/processors/networks/google_vision_bipartite_network.py index 
df48d5321..aef0f3c19 100644 --- a/processors/networks/google_vision_bipartite_network.py +++ b/processors/networks/google_vision_bipartite_network.py @@ -2,6 +2,7 @@ Google Vision API co-label network """ from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput from common.lib.exceptions import ProcessorInterruptedException @@ -25,6 +26,9 @@ class VisionTagBiPartiteNetworker(BasicProcessor): "labels if the label occurs for the image with that file name." extension = "gexf" # extension of result file, used internally and in UI + # Allow processor to run on Google Vision API data + compatibility = Compatibility(types={"google-vision-api"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -61,16 +65,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on Google Vision API data - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "google-vision-api" - def process(self): """ Generates a GEXF file-annotation graph. diff --git a/processors/networks/google_vision_network.py b/processors/networks/google_vision_network.py index 258a3e975..9ddf0da8d 100644 --- a/processors/networks/google_vision_network.py +++ b/processors/networks/google_vision_network.py @@ -2,6 +2,7 @@ Google Vision API co-label network """ from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput from common.lib.exceptions import ProcessorInterruptedException @@ -25,6 +26,9 @@ class VisionTagNetworker(BasicProcessor): "edges." 
extension = "gexf" # extension of result file, used internally and in UI + # Allow processor to run on Google Vision API data + compatibility = Compatibility(types={"google-vision-api"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -60,16 +64,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on Google Vision API data - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "google-vision-api" - def process(self): """ Generates a GDF co-annotation graph. diff --git a/processors/networks/hash_similarity_network.py b/processors/networks/hash_similarity_network.py index 997c2a8a4..845c91814 100644 --- a/processors/networks/hash_similarity_network.py +++ b/processors/networks/hash_similarity_network.py @@ -7,6 +7,7 @@ import numpy as np from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorException from common.lib.helpers import UserInput @@ -27,6 +28,9 @@ class HashSimilarityNetworker(BasicProcessor): description = "Calculate similarity of hashes and create a GEXF network file. Can identify near duplicate hashes." extension = "gexf" + # Currently only allowed on video-hashes, though any row of bit hashes would work. + compatibility = Compatibility(types={"video-hashes"}) + @classmethod def get_options(cls, parent_dataset=None, config=None): """ @@ -67,14 +71,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Currently only allowed on video-hashes, but technically any row of bit hashes will work. Could check for "hash" - in columns, but... how to make that a check as a classmethod? 
- """ - return module.type == "video-hashes" - def process(self): """ Takes a list of bit hashes and compares them. Then makes network file. diff --git a/processors/networks/image-network.py b/processors/networks/image-network.py index ab71e864d..ee573604b 100644 --- a/processors/networks/image-network.py +++ b/processors/networks/image-network.py @@ -15,6 +15,7 @@ from common.lib.exceptions import ProcessorInterruptedException from common.lib.user_input import UserInput +from common.lib.compatibility import Compatibility class ImageGrapher(BasicProcessor): @@ -33,6 +34,9 @@ class ImageGrapher(BasicProcessor): "'Image Preview' plugin.") extension = "gexf" # extension of result file, used internally and in UI + # coarse map spec; is_compatible_with (below) is the runtime truth -- it also walks the + # genealogy to find an image-downloader root (get_root_dataset) + compatibility = Compatibility(type_prefixes={"image-downloader"}) @classmethod def get_options(cls, parent_dataset=None, config=None): diff --git a/processors/networks/quote_network.py b/processors/networks/quote_network.py index 0ad478fa7..98ebf2979 100644 --- a/processors/networks/quote_network.py +++ b/processors/networks/quote_network.py @@ -4,6 +4,7 @@ import re from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility import networkx as nx @@ -25,16 +26,9 @@ class QuoteNetworkGrapher(BasicProcessor): "Each reference to another post creates an edge between posts. 
" # description displayed in UI extension = "gexf" # extension of result file, used internally and in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on chan datasets + # chan datasets (posts reply to / quote each other) + compatibility = Compatibility(datasources={"fourchan", "eightchan", "eightkun"}) - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.parameters.get("datasource") in ("fourchan", "eightchan", "eightkun") - def process(self): """ This takes a 4CAT results file as input, and outputs a new CSV file diff --git a/processors/networks/two-column-network.py b/processors/networks/two-column-network.py index 5217bf393..f79cb6f85 100644 --- a/processors/networks/two-column-network.py +++ b/processors/networks/two-column-network.py @@ -5,6 +5,7 @@ from functools import partial from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput, get_interval_descriptor import networkx as nx @@ -27,6 +28,9 @@ class ColumnNetworker(BasicProcessor): "(e.g. 'author' and 'subreddit'). Nodes and edges are weighted by frequency." extension = "gexf" + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) + references = [ "Utilises [Networkx](https://networkx.org/)' built-in [Louvain](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.community.louvain.louvain_communities.html#networkx.algorithms.community.louvain.louvain_communities) and [greedy modularity](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.community.modularity_max.greedy_modularity_communities.html#networkx.algorithms.community.modularity_max.greedy_modularity_communities) community detection algorithms." 
] @@ -139,16 +143,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on all csv and NDJSON datasets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_extension() in ("csv", "ndjson") - def process(self): """ This takes a 4CAT results file as input, and creates a network file @@ -226,7 +220,7 @@ def process(self): raise ValueError(f"Date '{item.get('timestamp')}' cannot be parsed") except ValueError as e: return self.dataset.finish_with_error(f"{e}, cannot count posts per {interval_type}") - + # Track nodes per item (categoise option adjusts node name to include column if True) processed_nodes = set() @@ -256,13 +250,13 @@ def process(self): network.nodes[node_a]["intervals"][interval] += 1 processed_nodes.add(node_a) - + if node_b not in processed_nodes: if node_b not in network.nodes(): network.add_node(node_b, intervals={}, frequency=1, label=value_b, **({"category": column_b} if categorise else {})) else: network.nodes[node_b]["frequency"] += 1 - + if interval not in network.nodes[node_b]["intervals"]: network.nodes[node_b]["intervals"][interval] = 0 network.nodes[node_b]["intervals"][interval] += 1 @@ -275,7 +269,7 @@ def process(self): edge = tuple(sorted((node_a, node_b))) else: edge = (node_a, node_b) - + if edge not in processed_edges: if edge not in network.edges(): network.add_edge(node_a, node_b, intervals={}, frequency=1, weight=1) diff --git a/processors/networks/user_hashtag_network.py b/processors/networks/user_hashtag_network.py index 7ec1d11b7..b21366cf7 100644 --- a/processors/networks/user_hashtag_network.py +++ b/processors/networks/user_hashtag_network.py @@ -3,6 +3,7 @@ """ from backend.lib.preset import ProcessorPreset from common.lib.user_input import UserInput +from common.lib.compatibility import 
Compatibility __author__ = "Stijn Peeters" @@ -21,6 +22,9 @@ class HashtagUserBipartiteGrapherPreset(ProcessorPreset): description = "Produces a bipartite graph based on co-occurence of (hash)tags and authors. If someone wrote a post with a certain tag, there will be a link between that person and the tag. The more often they appear together, the stronger the link. Tag nodes are weighed on how often they occur. User nodes are weighed on how many posts they've made." # description displayed in UI extension = "gexf" # extension of result file, used internally and in UI + # datasets with at least one tag-like column + compatibility = Compatibility(requires_any_columns={"tags", "hashtags", "groups"}) + @classmethod def get_options(cls, parent_dataset=None, config=None): return { @@ -32,18 +36,6 @@ def get_options(cls, parent_dataset=None, config=None): } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on datasets containing a tags column - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - usable_columns = {"tags", "hashtags", "groups"} - columns = module.get_columns() - return bool(set(columns) & usable_columns) if columns else False - def get_processor_pipeline(self): """ Generate bipartite user-hashtag graph of items diff --git a/processors/networks/wikipedia_network.py b/processors/networks/wikipedia_network.py index 334ef304e..e69a00045 100644 --- a/processors/networks/wikipedia_network.py +++ b/processors/networks/wikipedia_network.py @@ -9,6 +9,7 @@ import networkx as nx from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException __author__ = "Stijn Peeters" @@ -27,15 +28,8 @@ class WikiURLCoLinker(BasicProcessor): description = "Create a GEXF network file comprised network comprised of linked-to Wikipedia pages, linked to 
the categories they are part of. English Wikipedia only. Will only fetch the first 10,000 links." # description displayed in UI extension = "gexf" # extension of result file, used internally and in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on top datasets. - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) def process(self): """ @@ -66,7 +60,7 @@ def process(self): if not post["body"]: continue - + wiki_links = link_regex.findall(post["body"]) # if the result has an explicit url per post, take that into @@ -147,7 +141,7 @@ def stringify_children(node): # Add " (cat)" to the category strings. # This is needed because pages can sometimes have the same name as the category. # This will result in a faulty graph, since there's duplicate nodes. - + category += " (cat)" if category not in all_categories: diff --git a/processors/presets/annotate-images.py b/processors/presets/annotate-images.py index b8650e698..829c5d8ac 100644 --- a/processors/presets/annotate-images.py +++ b/processors/presets/annotate-images.py @@ -2,6 +2,7 @@ Annotate top images """ from backend.lib.preset import ProcessorPreset +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput, convert_to_int @@ -17,6 +18,9 @@ class AnnotateImages(ProcessorPreset): "this is a paid service and will count towards your API credit." 
extension = "csv" + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + references = [ "[Google Vision API Documentation](https://cloud.google.com/vision/docs)", "[Google Vision API Pricing & Free Usage Limits](https://cloud.google.com/vision/pricing)" @@ -67,17 +71,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def get_processor_pipeline(self): """ This queues a series of post-processors to annotate images diff --git a/processors/presets/monthly-histogram.py b/processors/presets/monthly-histogram.py index e982aafc7..a67464b5a 100644 --- a/processors/presets/monthly-histogram.py +++ b/processors/presets/monthly-histogram.py @@ -2,6 +2,7 @@ Extract neologisms """ from backend.lib.preset import ProcessorPreset +from common.lib.compatibility import Compatibility from processors.metrics.count_posts import CountPosts @@ -15,19 +16,9 @@ class MonthlyHistogramCreator(ProcessorPreset): description = "Create a histogram that shows the number of items over time." 
# description displayed in UI extension = "svg" - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - This preset is compatible with any module that has countable items (via count-posts) + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - @classmethod def get_options(cls, parent_dataset=None, config=None): count_options = CountPosts.get_options(parent_dataset=parent_dataset, config=config) diff --git a/processors/presets/neologisms.py b/processors/presets/neologisms.py index 863c70c11..0b1e0179d 100644 --- a/processors/presets/neologisms.py +++ b/processors/presets/neologisms.py @@ -2,6 +2,7 @@ Extract neologisms """ from backend.lib.preset import ProcessorPreset +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput @@ -17,6 +18,9 @@ class NeologismExtractor(ProcessorPreset): "language data. Uses stopwords-iso as a stopword filter.") extension = "csv" + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) + references = [ "Van Soest, Jeroen. 2019. 'Language Innovation Tracker: Detecting language innovation in online discussion fora.' (MA thesis), Beuls, K. (Promotor), Van Eecke, P. 
(Advisor).'"] @@ -50,17 +54,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on all csv and NDJSON datasets - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - - return module.get_extension() in ("csv", "ndjson") - def get_processor_pipeline(self): """ This queues a series of post-processors to extract neologisms from a diff --git a/processors/presets/similar-words.py b/processors/presets/similar-words.py index 482098ed4..5c172776e 100644 --- a/processors/presets/similar-words.py +++ b/processors/presets/similar-words.py @@ -4,6 +4,7 @@ from nltk.stem.snowball import SnowballStemmer from backend.lib.preset import ProcessorPreset +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput @@ -19,6 +20,9 @@ class SimilarWords(ProcessorPreset): "with large datasets (e.g. 
100,000+ items).") extension = "csv" + # Allow on top-level CSV/NDJSON datasets + compatibility = Compatibility(top_dataset_only=True, extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -50,19 +54,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - This preset is compatible with any module that has a "body" column - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def get_processor_pipeline(self): """ This queues a series of post-processors to calculate word similarities diff --git a/processors/presets/top-hashtags.py b/processors/presets/top-hashtags.py index 93a417693..dc79a784e 100644 --- a/processors/presets/top-hashtags.py +++ b/processors/presets/top-hashtags.py @@ -4,6 +4,7 @@ from backend.lib.preset import ProcessorPreset from common.lib.helpers import UserInput from processors.networks.cotag_network import CoTaggerPreset +from common.lib.compatibility import Compatibility class TopHashtags(ProcessorPreset): @@ -16,6 +17,9 @@ class TopHashtags(ProcessorPreset): description = "Count how often each hashtag occurs in the dataset and sort by this value" extension = "csv" + # datasets with at least one tag-like column + compatibility = Compatibility(requires_any_columns=CoTaggerPreset.possible_tag_columns) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -43,18 +47,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Check if dataset has a hashtag attribute - - :param module: Dataset to check - :param ConfigManager|None config: Configuration reader 
(context-aware) - :return bool: - """ - columns = module.get_columns() - return columns and any([tag in columns for tag in CoTaggerPreset.possible_tag_columns]) - def get_processor_pipeline(self): """ This is basically a 'count values' processor with some defaults diff --git a/processors/presets/upload-to-dmi-tcat.py b/processors/presets/upload-to-dmi-tcat.py index 74f039c6d..7e6004415 100644 --- a/processors/presets/upload-to-dmi-tcat.py +++ b/processors/presets/upload-to-dmi-tcat.py @@ -3,6 +3,7 @@ """ from backend.lib.preset import ProcessorPreset from common.lib.helpers import UserInput +from common.lib.compatibility import Compatibility class FourcatToDmiTcatConverterAndUploader(ProcessorPreset): """ @@ -14,6 +15,9 @@ class FourcatToDmiTcatConverterAndUploader(ProcessorPreset): description = "Convert the dataset to a TCAT-compatible format and upload it to an available TCAT server." # description displayed in UI extension = "html" + # Twitter v2 search results, when a TCAT server is configured + compatibility = Compatibility(types={"twitterv2-search"}, required_settings={"tcat-auto-upload.server_url", "tcat-auto-upload.token", "tcat-auto-upload.username", "tcat-auto-upload.password"}) + @classmethod def get_options(cls, parent_dataset=None, config=None): """ @@ -48,20 +52,6 @@ def get_options(cls, parent_dataset=None, config=None): else: return {} - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "twitterv2-search" and \ - config.get('tcat-auto-upload.server_url') and \ - config.get('tcat-auto-upload.token') and \ - config.get('tcat-auto-upload.username') and \ - config.get('tcat-auto-upload.password') - def get_processor_pipeline(self): """ This queues a series of post-processors to upload a dataset to a diff 
--git a/processors/presets/video-scene-timelines.py b/processors/presets/video-scene-timelines.py index 579288583..406c2a07e 100644 --- a/processors/presets/video-scene-timelines.py +++ b/processors/presets/video-scene-timelines.py @@ -1,9 +1,9 @@ """ Create scene-by-scene timelines """ -import shutil from backend.lib.preset import ProcessorPreset +from common.lib.compatibility import Compatibility, is_executable class VideoSceneTimelineCreator(ProcessorPreset): @@ -18,20 +18,8 @@ class VideoSceneTimelineCreator(ProcessorPreset): "for all videos are then stacked vertically and rendered as a single SVG file." extension = "svg" - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine compatibility - - Compatible with downloaded videos, and not really anything else! - Additionally ffmpeg needs to be available. - - :param DataSet module: Module ID to determine compatibility with - :return bool: - """ - return (module.get_media_type() == "video" or module.type.startswith("video-downloader")) and \ - config.get("video-downloader.ffmpeg_path") and \ - shutil.which(config.get("video-downloader.ffmpeg_path")) + # Allow on video datasets when ffmpeg is available + compatibility = Compatibility(media_types={"video"}, type_prefixes={"video-downloader"}, required_settings={("video-downloader.ffmpeg_path", is_executable)}) def get_processor_pipeline(self): """ diff --git a/processors/statistics/classification_evaluation.py b/processors/statistics/classification_evaluation.py index 85fcee44c..512e2a67d 100644 --- a/processors/statistics/classification_evaluation.py +++ b/processors/statistics/classification_evaluation.py @@ -5,6 +5,7 @@ from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import UserInput, andify from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from sklearn.preprocessing import MultiLabelBinarizer from sklearn.metrics import precision_score, 
recall_score, f1_score, accuracy_score, cohen_kappa_score @@ -26,6 +27,9 @@ class ClassificationEvaluation(BasicProcessor): "and Cohen's Kappa). Produces overall and per-label metrics. Also supports multi-label values.") extension = "csv" # extension of result file, used internally and in UI + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: options = { @@ -100,17 +104,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: return options - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.get_extension() in ("csv", "ndjson") - def process(self): skip_empty = self.parameters.get("skip_empty", False) diff --git a/processors/statistics/confusion_matrix.py b/processors/statistics/confusion_matrix.py index 9a33e7c86..00e4c521e 100644 --- a/processors/statistics/confusion_matrix.py +++ b/processors/statistics/confusion_matrix.py @@ -4,6 +4,7 @@ from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import UserInput from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay import matplotlib @@ -25,6 +26,9 @@ class ConfusionMatrix(BasicProcessor): description = "Create a confusion matrix with data from two columns." 
# description displayed in UI extension = "png" # extension of result file, used internally and in UI + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: options = { @@ -51,17 +55,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: return options - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.get_extension() in ("csv", "ndjson") - def process(self): skip_empty = self.parameters.get("skip_empty", False) diff --git a/processors/statistics/descriptive_statistics.py b/processors/statistics/descriptive_statistics.py index 01f3d4b2d..6156ecfac 100644 --- a/processors/statistics/descriptive_statistics.py +++ b/processors/statistics/descriptive_statistics.py @@ -5,6 +5,7 @@ from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import UserInput from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility import numpy as np @@ -24,6 +25,9 @@ class DescriptiveStatistics(BasicProcessor): description = "Calculate descriptive statistics (mean, median, std dev, etc.) for numerical columns." 
extension = "csv" # extension of result file, used internally in UI + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: options = { @@ -51,21 +55,10 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: return options - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.get_extension() in ("csv", "ndjson") - def process(self): skip_empty = self.parameters.get("skip_empty", True) selected_columns = self.parameters.get("columns", []) - + if not selected_columns: self.dataset.finish_with_error("Please select at least one column to analyze") return @@ -87,14 +80,14 @@ def process(self): # First pass: check if we can process this row for col in selected_columns: val = item.get(col, "") - + # Handle empty values if val is None or val == "": if not skip_empty: row_valid = False break continue - + # Try to convert to float try: float_val = float(val) @@ -108,7 +101,7 @@ def process(self): return row_valid = False break - + # Second pass: add valid values to our data structure if row_valid and row_values: for col in selected_columns: @@ -122,15 +115,15 @@ def process(self): # Calculate statistics for each column results = [] - + for column in selected_columns: if not column_data[column]: self.dataset.finish_with_error(f"No valid numerical values found in column '{column}'") return - + # Convert to numpy array for calculations values = np.array(column_data[column]) - + # Calculate statistics stats = {"column": column} diff --git a/processors/statistics/regression-evaluation.py b/processors/statistics/regression-evaluation.py index 69dea0e26..09fc65da9 100644 --- a/processors/statistics/regression-evaluation.py +++ 
b/processors/statistics/regression-evaluation.py @@ -5,6 +5,7 @@ from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import UserInput from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score import numpy as np @@ -25,6 +26,9 @@ class RegressionEvaluation(BasicProcessor): description = "Calculate regression metrics (MAE, MSE, R2, RMSE) between two numerical columns." extension = "csv" # extension of result file, used internally in UI + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: options = { @@ -70,25 +74,14 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: return options - @staticmethod - def is_compatible_with(module=None, config=None): - """ - Determine compatibility - - :param Dataset module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.get_extension() in ("csv", "ndjson") - def process(self): skip_empty = self.parameters.get("skip_empty", True) metrics = self.parameters.get("metrics", ["mae", "mse", "rmse", "r2"]) - + if not metrics: self.dataset.finish_with_error("Please select at least one evaluation metric") return - + # Get which metrics to calculate get_mae = "mae" in metrics get_mse = "mse" in metrics @@ -98,7 +91,7 @@ def process(self): # Parse the column names column_true = self.parameters.get("column_true", "") column_pred = self.parameters.get("column_pred", "") - + if not column_true or not column_pred: self.dataset.finish_with_error("Please specify which columns contain the true and predicted values") return @@ -122,7 +115,7 @@ def process(self): if skip_empty and (true_val is None or pred_val is None or true_val == "" or pred_val == ""): 
skipped_rows += 1 continue - + # Try to convert to float try: true_float = float(true_val) @@ -142,42 +135,42 @@ def process(self): if not true_values or not pred_values: self.dataset.finish_with_error("No valid numerical values found in the specified columns") return - + if len(true_values) != len(pred_values): self.dataset.finish_with_error("Mismatch in number of true and predicted values") return - + if skipped_rows > 0: self.dataset.update_status(f"Skipped {skipped_rows} rows with missing or invalid values") # Convert to numpy arrays for calculations true_array = np.array(true_values) pred_array = np.array(pred_values) - + # Calculate metrics results = [] - + if get_mae: mae = mean_absolute_error(true_array, pred_array) results.append({ "metric": "MAE", "value": round(mae, 5) }) - + if get_mse: mse = mean_squared_error(true_array, pred_array) results.append({ "metric": "MSE", "value": round(mse, 5) }) - + if get_rmse: rmse = np.sqrt(mean_squared_error(true_array, pred_array)) results.append({ "metric": "RMSE", "value": round(rmse, 5) }) - + if get_r2: r2 = r2_score(true_array, pred_array) results.append({ diff --git a/processors/text-analysis/collocations.py b/processors/text-analysis/collocations.py index 2b9ffe71c..147608334 100644 --- a/processors/text-analysis/collocations.py +++ b/processors/text-analysis/collocations.py @@ -10,6 +10,7 @@ from common.lib.helpers import UserInput from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility class GetCollocations(BasicProcessor): @@ -22,17 +23,8 @@ class GetCollocations(BasicProcessor): description = "Extracts words appearing close to each other from a set of tokens." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["preset-coword-network", "wordcloud"] - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token sets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "tokenise-posts" + # Allow processor on token sets + compatibility = Compatibility(types={"tokenise-posts"}, preferred_followups=["preset-coword-network", "wordcloud"]) @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: diff --git a/processors/text-analysis/documents_per_topic.py b/processors/text-analysis/documents_per_topic.py index 370c8d778..80375c339 100644 --- a/processors/text-analysis/documents_per_topic.py +++ b/processors/text-analysis/documents_per_topic.py @@ -3,6 +3,7 @@ """ from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException import json @@ -24,17 +25,8 @@ class TopicModelWordExtractor(BasicProcessor): description = "Uses the LDA model to predict to which topic each item or sentence belongs and counts as belonging to whichever topic has the highest probability." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on topic models - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "topic-modeller" + # Allow processor on topic models + compatibility = Compatibility(types={"topic-modeller"}) def process(self): """ diff --git a/processors/text-analysis/generate_embeddings.py b/processors/text-analysis/generate_embeddings.py index 4dd6cdc7b..ea352895b 100644 --- a/processors/text-analysis/generate_embeddings.py +++ b/processors/text-analysis/generate_embeddings.py @@ -10,6 +10,7 @@ from common.lib.helpers import UserInput, convert_to_int from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException __author__ = "Sal Hagen" @@ -32,7 +33,8 @@ class GenerateWordEmbeddings(BasicProcessor): "Note that good models require a lot of data." # description displayed in UI extension = "zip" # extension of result file, used internally and in UI - followups = ["similar-word2vec", "histwords-vectspace"] + # Allow processor on token sets + compatibility = Compatibility(types={"tokenise-posts"}, preferred_followups=["similar-word2vec", "histwords-vectspace"]) references = [ "word2vec: [Mikolov, Tomas, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. 2013. 
“Distributed Representations of Words and Phrases and Their Compositionality.” 8Advances in Neural Information Processing Systems*, 2013: 3111-3119.](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)", @@ -114,16 +116,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token sets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "tokenise-posts" - def process(self): """ This takes a 4CAT results file as input, and outputs a number of files containing diff --git a/processors/text-analysis/post_topic_matrix.py b/processors/text-analysis/post_topic_matrix.py index 4351eac3a..190abcef6 100644 --- a/processors/text-analysis/post_topic_matrix.py +++ b/processors/text-analysis/post_topic_matrix.py @@ -4,6 +4,7 @@ from common.lib.helpers import UserInput from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException import csv @@ -29,7 +30,8 @@ class TopicModelWordExtractor(BasicProcessor): "by multiple rows (for each sentence and/or column used).") # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] + # Allow processor on topic models + compatibility = Compatibility(types={"topic-modeller"}) @classmethod def get_options(cls, parent_dataset=None, config=None): @@ -75,16 +77,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on topic models - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == 
"topic-modeller" - def process(self): """ Extracts metadata and connects to original dataset diff --git a/processors/text-analysis/similar_words.py b/processors/text-analysis/similar_words.py index 9aa0996f5..7db6573c6 100644 --- a/processors/text-analysis/similar_words.py +++ b/processors/text-analysis/similar_words.py @@ -7,6 +7,7 @@ from common.lib.helpers import UserInput, convert_to_int, convert_to_float from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException __author__ = "Sal Hagen" @@ -25,7 +26,8 @@ class SimilarWord2VecWords(BasicProcessor): description = "Uses a word2vec model to find words used in a similar context" # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["wordcloud"] + # Allow processor on word embedding models + compatibility = Compatibility(types={"generate-embeddings"}, preferred_followups=["wordcloud"]) flawless = True @@ -67,16 +69,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on word embedding models - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "generate-embeddings" - def process(self): """ This takes previously generated Word2Vec models and uses them to find diff --git a/processors/text-analysis/split_sentences.py b/processors/text-analysis/split_sentences.py index 35a9016b5..e5a5cc712 100644 --- a/processors/text-analysis/split_sentences.py +++ b/processors/text-analysis/split_sentences.py @@ -6,6 +6,7 @@ from common.lib.helpers import UserInput from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -23,7 +24,8 @@ class 
SplitSentences(BasicProcessor): description = "Split a body of posts into discrete sentences. Output file has one row per sentence, containing the sentence and item ID." # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = [] + # Allow on CSV/NDJSON datasets + compatibility = Compatibility(extensions={"csv", "ndjson"}) @classmethod def get_options(cls, parent_dataset=None, config=None): @@ -85,17 +87,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on all csv and NDJSON datasets - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - - return module.get_extension() in ("csv", "ndjson") - def process(self): """ This takes a 4CAT results file as input, and outputs a number of files containing diff --git a/processors/text-analysis/tf_idf.py b/processors/text-analysis/tf_idf.py index 7fc3aba80..4e7378c4f 100644 --- a/processors/text-analysis/tf_idf.py +++ b/processors/text-analysis/tf_idf.py @@ -9,6 +9,7 @@ from common.lib.helpers import UserInput, convert_to_int from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from sklearn.feature_extraction.text import TfidfVectorizer from gensim.models import TfidfModel @@ -31,7 +32,15 @@ class TfIdf(BasicProcessor): description = "Get the tf-idf values of tokenised text. Works better with more documents (e.g. time-separated)." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["wordcloud"] + # Allow processor on token sets + compatibility = Compatibility( + types={"tokenise-posts"}, + preferred_followups=["wordcloud"], + excluded_followups=[ + "consolidate-urls", "preset-neologisms", "sentence-split", "tokenise-posts", + "image-downloader-stable-diffusion", "word-trees", "histogram", "extract-urls-filter", + ], + ) references = [ "[Spärck Jones, Karen. 1972. \"A statistical interpretation of term specificity and its application in retrieval.\" *Journal of Documentation* (28), 1: 11–21.](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.115.8343&rep=rep1&type=pdf)", @@ -105,16 +114,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token sets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "tokenise-posts" - def process(self): """ Unzips and appends tokens to fetch and write a tf-idf matrix @@ -173,7 +172,7 @@ def process(self): if max_occurrences <= 0 or max_occurrences > len(tokens): max_occurrences = len(tokens) self.dataset.log(f"Running tf-idf with library {library}, n_size {n_size}, min_occurrences {min_occurrences}, max_occurrences {max_occurrences}, max_output {max_output}, smartirs {smartirs}") - + # Get the tf-idf matrix. 
self.dataset.update_status("Generating tf-idf for token set") try: @@ -306,12 +305,3 @@ def get_tfidf_sklearn(self, tokens, dates, ngram_range=(1, 1), min_occurrences=0 results.append(result) return results - - @classmethod - def exclude_followup_processors(cls, processor_type): - """ - Exclude followups if they are not compatible with the module - """ - if processor_type in ["consolidate-urls", "preset-neologisms", "sentence-split", "tokenise-posts", "image-downloader-stable-diffusion", "word-trees", "histogram", "extract-urls-filter"]: - return True - return False diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py index 58f45f6a2..c95fd8928 100644 --- a/processors/text-analysis/tokenise.py +++ b/processors/text-analysis/tokenise.py @@ -17,6 +17,7 @@ from common.lib.helpers import UserInput, get_interval_descriptor from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = ["Stijn Peeters", "Sal Hagen"] __credits__ = ["Stijn Peeters", "Sal Hagen"] @@ -36,7 +37,7 @@ class Tokenise(BasicProcessor): "tokens per sentence." 
# description displayed in UI extension = "zip" # extension of result file, used internally and in UI - followups = ["collocations", "vectorise-tokens", "generate-embeddings", "tfidf", "topic-modeller", ] + compatibility = Compatibility(extensions={"csv", "ndjson"}, preferred_followups=["collocations", "vectorise-tokens", "generate-embeddings", "tfidf", "topic-modeller", ]) references = [ "[NLTK tokenizer documentation](https://www.nltk.org/api/nltk.tokenize.html)", @@ -47,17 +48,6 @@ class Tokenise(BasicProcessor): "[Words in OpenTaal word list](https://github.com/OpenTaal/opentaal-wordlist)" ] - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on all csv and NDJSON datasets - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - - return module.get_extension() in ("csv", "ndjson") - @classmethod def get_options(cls, parent_dataset=None, config=None): """ diff --git a/processors/text-analysis/top_vectors.py b/processors/text-analysis/top_vectors.py index 0a971c7a3..a04d33d3c 100644 --- a/processors/text-analysis/top_vectors.py +++ b/processors/text-analysis/top_vectors.py @@ -7,6 +7,7 @@ from common.lib.helpers import UserInput, convert_to_int from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -24,7 +25,8 @@ class VectorRanker(BasicProcessor): "Limited to 100 most-used tokens." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["wordcloud"] + # Allow processor on token vectors + compatibility = Compatibility(types={"vectorise-tokens"}, preferred_followups=["wordcloud"]) @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -52,16 +54,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token vectors - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "vectorise-tokens" - def process(self): """ Reads vector set and creates a CSV with ranked vectors @@ -108,7 +100,7 @@ def file_to_timestamp(file): vectors = vector_unpacker.load(binary_tokens) vectors = sorted(vectors, key=lambda x: x[1], reverse=True) - + # for overall ranking we need the full vector space per interval # because maybe an overall top-ranking vector is at the bottom # in this particular interval - we'll truncate the top list at diff --git a/processors/text-analysis/topic_modeling.py b/processors/text-analysis/topic_modeling.py index b41e55422..d9070ab67 100644 --- a/processors/text-analysis/topic_modeling.py +++ b/processors/text-analysis/topic_modeling.py @@ -4,6 +4,7 @@ from common.lib.helpers import UserInput from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException import json @@ -31,12 +32,14 @@ class TopicModeler(BasicProcessor): "which can be used to find clusters of related words." 
# description displayed in UI extension = "zip" # extension of result file, used internally and in UI - followups = ["document_count", "document_topic_matrix", "topic-model-words"] + # Allow processor on token sets + compatibility = Compatibility(types={"tokenise-posts"}, preferred_followups=["document_count", "document_topic_matrix", "topic-model-words"]) + references = [ 'Blei, David M., Andrew Y. Ng, and Michael I. Jordan (2003). "Latent dirichlet allocation." the *Journal of machine Learning research* 3: 993-1022.', 'Blei, David M. (2003). "Topic Modeling and Digital Humanities." *Journal of Digital Humanities* 2(1).' ] - + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -85,16 +88,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token sets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "tokenise-posts" - def process(self): """ Unzips token sets and builds topic models for each one. Model data is diff --git a/processors/text-analysis/topic_words.py b/processors/text-analysis/topic_words.py index 40564a3fc..91d37d382 100644 --- a/processors/text-analysis/topic_words.py +++ b/processors/text-analysis/topic_words.py @@ -4,6 +4,7 @@ from common.lib.helpers import UserInput from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException import pickle @@ -24,7 +25,8 @@ class TopicModelWordExtractor(BasicProcessor): description = "Creates a CSV file with the top tokens (words) per topic in the generated topic model, and their associated weights." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["wordcloud"] + # Allow processor on topic models + compatibility = Compatibility(types={"topic-modeller"}, preferred_followups=["wordcloud"]) @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -48,16 +50,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on topic models - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "topic-modeller" - def process(self): """ Extracts topics per model and top associated words diff --git a/processors/text-analysis/vectorise.py b/processors/text-analysis/vectorise.py index b9f854f59..0b591afba 100644 --- a/processors/text-analysis/vectorise.py +++ b/processors/text-analysis/vectorise.py @@ -6,6 +6,7 @@ import itertools from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -22,17 +23,8 @@ class Vectorise(BasicProcessor): description = "Counts how often a token appears in the dataset. This creates a bag of words." 
# description displayed in UI extension = "zip" # extension of result file, used internally and in UI - followups = ["vector-ranker"] - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token sets - - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "tokenise-posts" + # Allow processor on token sets + compatibility = Compatibility(types={"tokenise-posts"}, preferred_followups=["vector-ranker"]) def process(self): """ diff --git a/processors/text-analysis/vectorise_by_cat.py b/processors/text-analysis/vectorise_by_cat.py index e7380ce16..eacf9a2a1 100644 --- a/processors/text-analysis/vectorise_by_cat.py +++ b/processors/text-analysis/vectorise_by_cat.py @@ -6,6 +6,7 @@ import pickle from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Dale Wahl" @@ -23,16 +24,15 @@ class VectoriseByCategory(BasicProcessor): description = "Counts all tokens per category." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["wordcloud", "render-graphs-isometric", "render-rankflow"] - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token sets - - :param module: Module to determine compatibility with - """ - return module.type == "tokenise-posts" + # Allow processor on token sets + compatibility = Compatibility( + types={"tokenise-posts"}, + preferred_followups=["wordcloud", "render-graphs-isometric", "render-rankflow"], + excluded_followups=[ + "consolidate-urls", "preset-neologisms", "sentence-split", "tokenise-posts", + "image-downloader-stable-diffusion", "word-trees", "histogram", "extract-urls-filter", + ], + ) @classmethod def get_options(cls, parent_dataset=None, config=None): @@ -293,13 +293,4 @@ def process(self): # Finish self.dataset.update_status("Finished") - self.dataset.finish(done) - - @classmethod - def exclude_followup_processors(cls, processor_type): - """ - Exclude followups if they are not compatible with the module - """ - if processor_type in ["consolidate-urls", "preset-neologisms", "sentence-split", "tokenise-posts", "image-downloader-stable-diffusion", "word-trees", "histogram", "extract-urls-filter"]: - return True - return False \ No newline at end of file + self.dataset.finish(done) \ No newline at end of file diff --git a/processors/twitter/aggregate_stats.py b/processors/twitter/aggregate_stats.py index f1b0c667b..c07cc9aac 100644 --- a/processors/twitter/aggregate_stats.py +++ b/processors/twitter/aggregate_stats.py @@ -7,6 +7,7 @@ from common.lib.helpers import UserInput, pad_interval, get_interval_descriptor from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorException __author__ = "Dale Wahl" @@ -25,6 +26,9 @@ class TwitterAggregatedStats(BasicProcessor): description = "Group tweets by category 
and count tweets per timeframe and then calculate aggregate group statistics (i.e. min, max, average, Q1, median, Q3, and trimmed mean): number of tweets, urls, hashtags, mentions, etc. \nUse for example to find the distribution of the number of tweets per author and compare across time." # description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) + num_of_different_categories = None @classmethod @@ -66,16 +70,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["twitterv2-search", "dmi-tcat-search"] - def trim_mean(self, values, cut_pct): """ Return mean of array after trimming a specified fraction of extreme values diff --git a/processors/twitter/base_twitter_stats.py b/processors/twitter/base_twitter_stats.py index e9d58930f..ab3d37e0a 100644 --- a/processors/twitter/base_twitter_stats.py +++ b/processors/twitter/base_twitter_stats.py @@ -6,6 +6,7 @@ from common.lib.helpers import pad_interval, get_interval_descriptor from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorException, ProcessorInterruptedException __author__ = "Dale Wahl" @@ -24,18 +25,11 @@ class TwitterStatsBase(BasicProcessor): description = "This is a class to help other twitter classes" # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - sorted = False - - - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if 
processor is compatible with dataset + # Abstract base for the Twitter statistics processors; not runnable on its + # own (an empty type set never matches a dataset). + compatibility = Compatibility(types=set()) - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return False + sorted = False def process(self): """ diff --git a/processors/twitter/custom_stats.py b/processors/twitter/custom_stats.py index 0e4d24836..e37cf73b9 100644 --- a/processors/twitter/custom_stats.py +++ b/processors/twitter/custom_stats.py @@ -4,6 +4,7 @@ from common.lib.exceptions import ProcessorException from common.lib.helpers import UserInput from processors.twitter.base_twitter_stats import TwitterStatsBase +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -21,6 +22,9 @@ class TwitterCustomStats(TwitterStatsBase): description = "Group tweets by category and count tweets per timeframe to collect aggregate group statistics.\nFor retweets and quotes, hashtags, mentions, URLs, and images from the original tweet are included in the retweet/quote. Data on public metrics (e.g., number of retweets or likes of tweets) are as of the time the data was collected." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) + sorted = 'Number of Tweets' @classmethod @@ -56,16 +60,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["twitterv2-search", "dmi-tcat-search"] - def map_data(self, post): """ Maps a post to collect aggregate data. Returns a key for grouping data, a dictionary of aggregate data that can diff --git a/processors/twitter/hashtag_stats.py b/processors/twitter/hashtag_stats.py index df41a3ee5..52bff79f1 100644 --- a/processors/twitter/hashtag_stats.py +++ b/processors/twitter/hashtag_stats.py @@ -3,6 +3,7 @@ """ from common.lib.helpers import UserInput from processors.twitter.base_twitter_stats import TwitterStatsBase +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -20,6 +21,9 @@ class TwitterHashtagStats(TwitterStatsBase): description = "Lists by hashtag how many tweets contain hashtags, how many times those tweets have been retweeted/replied to/liked/quoted, and information about unique users and hashtags used alongside each hashtag.\nFor retweets and quotes, hashtags from the original tweet are included in the retweet/quote." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) + sorted = 'Number of Tweets containing Hashtag' @classmethod @@ -52,16 +56,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: # } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["twitterv2-search", "dmi-tcat-search"] - def map_data(self, post): """ Maps a post to collect aggregate data. Returns a key for grouping data, a dictionary of aggregate data that can diff --git a/processors/twitter/identical_tweets.py b/processors/twitter/identical_tweets.py index 5e83c734a..9dc923fe5 100644 --- a/processors/twitter/identical_tweets.py +++ b/processors/twitter/identical_tweets.py @@ -3,6 +3,7 @@ """ from common.lib.helpers import UserInput from processors.twitter.base_twitter_stats import TwitterStatsBase +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -20,6 +21,9 @@ class TwitterIdenticalTweets(TwitterStatsBase): description = "Groups tweets by text and counts the number of times they have been (re)tweeted indentically." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) + sorted = 'Number of Identical Tweets' @classmethod @@ -43,16 +47,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["twitterv2-search", "dmi-tcat-search"] - def map_data(self, post): """ Maps a post to collect aggregate data. Returns a key for grouping data, a dictionary of aggregate data that can diff --git a/processors/twitter/mention_export.py b/processors/twitter/mention_export.py index d8f6723ef..834ee9287 100644 --- a/processors/twitter/mention_export.py +++ b/processors/twitter/mention_export.py @@ -4,6 +4,7 @@ import csv from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorException, ProcessorInterruptedException __author__ = "Dale Wahl" @@ -22,15 +23,8 @@ class TwitterMentionsExport(BasicProcessor): description = "Identifies mentions types and creates mentions table (tweet id, from author id, from username, to user id, to username, mention type)" # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["twitterv2-search"] + # Allow processor on Twitter/X 
(API v2) datasets + compatibility = Compatibility(types={"twitterv2-search"}) def process(self): """ @@ -150,15 +144,8 @@ class TCATMentionsExport(BasicProcessor): description = "Identifies mentions types and creates mentions table (tweet id, from author id, from username, to username)" # description displayed in UI extension = "csv" # extension of result file, used internally and in UI - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager config: Configuration reader (context-aware) - """ - return module.type in ["dmi-tcat-search"] + # Allow processor on imported TCAT datasets + compatibility = Compatibility(types={"dmi-tcat-search"}) def process(self): """ diff --git a/processors/twitter/source_stats.py b/processors/twitter/source_stats.py index e74071100..e5f25e00c 100644 --- a/processors/twitter/source_stats.py +++ b/processors/twitter/source_stats.py @@ -3,6 +3,7 @@ """ from common.lib.helpers import UserInput from processors.twitter.base_twitter_stats import TwitterStatsBase +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -20,6 +21,9 @@ class TwitterHashtagStats(TwitterStatsBase): description = "Lists by source of tweet how many tweets contain hashtags, how many times those tweets have been retweeted/replied to/liked/quoted, and information about unique users and hashtags used alongside each hashtag.\nFor retweets and quotes, hashtags from the original tweet are included in the retweet/quote." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) + sorted = 'Number of Tweets from Source' @classmethod @@ -52,16 +56,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: # } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["twitterv2-search", "dmi-tcat-search"] - def map_data(self, post): """ Maps a post to collect aggregate data. Returns a key for grouping data, a dictionary of aggregate data that can diff --git a/processors/twitter/twitter_stats.py b/processors/twitter/twitter_stats.py index 4b383023e..4fe18d6df 100644 --- a/processors/twitter/twitter_stats.py +++ b/processors/twitter/twitter_stats.py @@ -3,6 +3,7 @@ """ from common.lib.helpers import UserInput from processors.twitter.base_twitter_stats import TwitterStatsBase +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -20,6 +21,9 @@ class TwitterStats(TwitterStatsBase): description = "Contains the number of tweets, number of tweets with links, number of tweets with hashtags, number of tweets with mentions, number of retweets, and number of replies" # description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -47,16 +51,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def 
is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["twitterv2-search", "dmi-tcat-search"] - def map_data(self, post): """ Maps a post to collect aggregate data. Returns a key for grouping data, a dictionary of sum data that can diff --git a/processors/twitter/user_stats_individual.py b/processors/twitter/user_stats_individual.py index f79750f5a..ee9cfdf91 100644 --- a/processors/twitter/user_stats_individual.py +++ b/processors/twitter/user_stats_individual.py @@ -3,6 +3,7 @@ """ from common.lib.helpers import UserInput from processors.twitter.base_twitter_stats import TwitterStatsBase +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorException __author__ = "Dale Wahl" @@ -21,6 +22,9 @@ class TwitterStats(TwitterStatsBase): description = "Lists users and their number of tweets, number of followers, number of friends, how many times they are listed, their UTC time offset, whether the user has a verified account and how many times they appear in the data set." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) + sorted = "Tweets (in interval)" @classmethod @@ -52,16 +56,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: # } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["twitterv2-search", "dmi-tcat-search"] - def map_data(self, post): """ Maps a post to collect aggregate data. Returns a key for grouping data, a dictionary of aggregate data that can diff --git a/processors/twitter/user_visibility.py b/processors/twitter/user_visibility.py index 045b8e446..36f458a15 100644 --- a/processors/twitter/user_visibility.py +++ b/processors/twitter/user_visibility.py @@ -5,6 +5,7 @@ from common.lib.helpers import get_interval_descriptor from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException from common.lib.user_input import UserInput @@ -24,6 +25,9 @@ class TwitterUserVisibility(BasicProcessor): description = "Collects usernames and totals how many tweets are authored by the user and how many tweets mention the user" # description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow processor on Twitter/X datasets (API v2 or imported TCAT) + compatibility = Compatibility(types={"twitterv2-search", "dmi-tcat-search"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -45,16 +49,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def 
is_compatible_with(cls, module=None, config=None): - """ - Determine if processor is compatible with dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ["twitterv2-search", "dmi-tcat-search"] - def process(self): """ This takes a 4CAT twitter dataset file as input, and outputs a csv. diff --git a/processors/visualisation/download-telegram-images.py b/processors/visualisation/download-telegram-images.py index 0d692b66c..d69c74f89 100644 --- a/processors/visualisation/download-telegram-images.py +++ b/processors/visualisation/download-telegram-images.py @@ -8,6 +8,7 @@ from common.lib.helpers import UserInput from processors.visualisation.download_images import ImageDownloader from processors.visualisation.download_telegram_videos import TelegramVideoDownloader +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -32,7 +33,8 @@ class TelegramImageDownloader(TelegramVideoDownloader): extension = "zip" media_type = "image" - followups = ImageDownloader.followups + # coarse map spec; is_compatible_with (below) is the runtime truth (Telegram API creds) + compatibility = Compatibility(types={"telegram-search"}, preferred_followups=ImageDownloader.followups) config = { "image-downloader-telegram.max": { diff --git a/processors/visualisation/download_images.py b/processors/visualisation/download_images.py index 7bde41b03..bd13b21c7 100644 --- a/processors/visualisation/download_images.py +++ b/processors/visualisation/download_images.py @@ -14,6 +14,7 @@ from backend.lib.processor import BasicProcessor from backend.lib.proxied_requests import FailedProxiedRequest from common.lib.exceptions import ProcessorInterruptedException, FourcatException +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -44,6 +45,8 @@ class 
ImageDownloader(BasicProcessor): extension = "zip" # extension of result file, used internally and in UI media_type = "image" # media type of the dataset + # Shared list -- other download_* processors reuse this as ImageDownloader.followups + # (and preferred_followups below reuses it), so it stays a named attribute. followups = [ "image-wall", "image-category-wall", @@ -56,6 +59,9 @@ class ImageDownloader(BasicProcessor): "google-vision-api", ] + # top-image rankings or any collector's csv/ndjson output, except sources with their own image collection + compatibility = Compatibility(is_collector=True, types={"top-images"}, excluded_types={"tiktok-search", "tiktok-urls-search", "telegram-search", "fourchan-search"}, extensions={"csv", "ndjson"}, preferred_followups=followups) + config = { "image-downloader.max": { "type": UserInput.OPTION_TEXT, @@ -138,27 +144,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on top image rankings, collectors, but not specific collectors with their own image - collection methods - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return ( - (module.type == "top-images" or module.is_from_collector()) - and module.type - not in [ - "tiktok-search", - "tiktok-urls-search", - "telegram-search", - "fourchan-search", - ] - and module.get_extension() in ("csv", "ndjson") - ) - def process(self): """ This takes a 4CAT results file as input, and outputs a zip file with diff --git a/processors/visualisation/download_telegram_files.py b/processors/visualisation/download_telegram_files.py index 4f5c2bead..20870953d 100644 --- a/processors/visualisation/download_telegram_files.py +++ b/processors/visualisation/download_telegram_files.py @@ -14,6 +14,7 @@ from common.lib.helpers import UserInput from 
processors.visualisation.download_telegram_videos import TelegramVideoDownloader +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -42,9 +43,10 @@ class TelegramFileDownloader(TelegramVideoDownloader): extension = "zip" media_type = "file" - # no followups -- file outputs are heterogeneous and don't map cleanly to - # existing video/image follow-on processors - followups = [] + # coarse map spec; is_compatible_with (below) is the runtime truth (Telegram API creds). + # No preferred_followups -- file outputs are heterogeneous and don't map cleanly to + # existing video/image follow-on processors. + compatibility = Compatibility(types={"telegram-search"}, required_settings={"file-downloader-telegram.allow_files"}) config = { "file-downloader-telegram.allow_files": { diff --git a/processors/visualisation/download_telegram_videos.py b/processors/visualisation/download_telegram_videos.py index 270113713..a463d0126 100644 --- a/processors/visualisation/download_telegram_videos.py +++ b/processors/visualisation/download_telegram_videos.py @@ -22,6 +22,7 @@ class attributes to switch behavior for a different media type. 
from processors.visualisation.download_videos import VideoDownloaderPlus from common.lib.helpers import UserInput, timify from common.lib.dataset import DataSet +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters", "Dale Wahl"] @@ -46,7 +47,9 @@ class TelegramVideoDownloader(BasicProcessor): media_type = "video" # media type of the result flawless = True - followups = VideoDownloaderPlus.followups + # coarse map spec; is_compatible_with (below) is the runtime truth -- it also checks the + # source dataset carries Telegram API credentials, which are read from the dataset + compatibility = Compatibility(types={"telegram-search"}, required_settings={"video-downloader-telegram.allow_videos"}, preferred_followups=VideoDownloaderPlus.followups) config = { "video-downloader-telegram.max_videos": { diff --git a/processors/visualisation/download_tiktok.py b/processors/visualisation/download_tiktok.py index aed561930..158637da1 100644 --- a/processors/visualisation/download_tiktok.py +++ b/processors/visualisation/download_tiktok.py @@ -16,6 +16,7 @@ from datasources.tiktok.search_tiktok import SearchTikTok as SearchTikTokByImport from processors.visualisation.download_images import ImageDownloader from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" @@ -31,7 +32,8 @@ class TikTokImageDownloader(BasicProcessor): extension = "zip" media_type = "image" - followups = ImageDownloader.followups + # Allow processor on TikTok datasets + compatibility = Compatibility(types={"tiktok-search", "tiktok-urls-search"}, preferred_followups=ImageDownloader.followups) @classmethod def get_options(cls, parent_dataset=None, config=None): @@ -82,16 +84,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor TikTok datasets - - :param module: Dataset 
or processor to determine compatibility with - :param ConfigManager config: Configuration reader (context-aware) - """ - return module.type in ["tiktok-search", "tiktok-urls-search"] - def process(self): """ Reads a file, filtering items that match in the required way, and diff --git a/processors/visualisation/download_tiktok_video.py b/processors/visualisation/download_tiktok_video.py index 8103a0933..97a809602 100644 --- a/processors/visualisation/download_tiktok_video.py +++ b/processors/visualisation/download_tiktok_video.py @@ -12,6 +12,7 @@ from processors.visualisation.download_videos import VideoDownloaderPlus from backend.lib.processor import BasicProcessor from datasources.tiktok_urls.search_tiktok_urls import TikTokScraper +from common.lib.compatibility import Compatibility class TikTokVideoDownloader(ProcessorPreset): """ @@ -26,7 +27,9 @@ class TikTokVideoDownloader(ProcessorPreset): extension = "zip" media_type = "video" - followups = VideoDownloaderPlus.followups + # coarse map spec; is_compatible_with (below) is the runtime truth -- it also accepts + # tiktok uploads, which depends on the dataset label and can't be declared statically + compatibility = Compatibility(types={"tiktok-search", "tiktok-urls-search"}, preferred_followups=VideoDownloaderPlus.followups) @classmethod def get_options(cls, parent_dataset=None, config=None): @@ -125,13 +128,9 @@ class TikTokVideoMetadata(BasicProcessor): consecutive_failures = None - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Do not show anywhere - """ - return False - + # internal helper dataset; never offered as a processor + compatibility = Compatibility(types=set()) + @classmethod def get_options(cls, parent_dataset=None, config=None): """ diff --git a/processors/visualisation/download_videos.py b/processors/visualisation/download_videos.py index d559006d6..86f0b7ed5 100644 --- a/processors/visualisation/download_videos.py +++ b/processors/visualisation/download_videos.py 
@@ -20,6 +20,7 @@ from backend.lib.processor import BasicProcessor from backend.lib.proxied_requests import FailedProxiedRequest +from common.lib.compatibility import Compatibility from common.lib.dataset import DataSet from common.lib.exceptions import ProcessorInterruptedException, ProcessorException, DataSetException from common.lib.helpers import UserInput, sets_to_lists, url_to_filename @@ -95,8 +96,13 @@ class VideoDownloaderPlus(BasicProcessor): extension = "zip" # extension of result file, used internally and in UI media_type = "video" # media type of the processor + # Shared list -- other download_* processors reuse this as VideoDownloaderPlus.followups + # (and preferred_followups below reuses it), so it stays a named attribute. followups = ["audio-extractor", "metadata-viewer", "video-scene-detector", "preset-scene-timelines", "video-stack", "preset-video-hashes", "video-hasher-1", "video-frames"] + # any collector's csv/ndjson output (except sources with their own downloaders), plus the tiktok-metadata helper + compatibility = Compatibility(is_collector=True, types={"tiktok-video-downloader-metadata"}, excluded_types={"tiktok-search", "tiktok-urls-search", "telegram-search"}, extensions={"csv", "ndjson"}, preferred_followups=followups) + references = [ "[YT-DLP python package](https://github.com/yt-dlp/yt-dlp/#readme)", "[Supported sites](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)", @@ -302,26 +308,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine compatibility - - Compatible with any top-level dataset. Could run on any type of dataset - in principle, but any links to videos are likely to come from the top - dataset anyway. 
- - :param module: Module to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return ((module.type.endswith("-search") or - module.is_from_collector() or - module.type == "tiktok-video-downloader-metadata") - # These have their own video downloaders - and module.type not in ["tiktok-search", "tiktok-urls-search", "telegram-search"]) \ - and module.get_extension() in ("csv", "ndjson") - def process(self): """ This takes a 4CAT results file as input, and downloads video files diff --git a/processors/visualisation/histwords.py b/processors/visualisation/histwords.py index 11baa85e5..1fbe4302c 100644 --- a/processors/visualisation/histwords.py +++ b/processors/visualisation/histwords.py @@ -12,6 +12,7 @@ from gensim.models import KeyedVectors from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput, convert_to_int, get_4cat_canvas, convert_to_float from common.lib.exceptions import ProcessorInterruptedException @@ -41,6 +42,9 @@ class HistWordsVectorSpaceVisualiser(BasicProcessor): description = "Visualise nearest neighbours of a given query across all models and show the closest neighbours per model in one combined graph. Based on the 'HistWords' algorithm by Hamilton et al." # description displayed in UI extension = "svg" # extension of result file, used internally and in UI + # Allow processor on word embedding models + compatibility = Compatibility(types={"generate-embeddings"}) + references = [ "HistWords: [Hamilton, W. L., Leskovec, J., & Jurafsky, D. (2016). Diachronic word embeddings reveal statistical laws of semantic change. *arXiv preprint** arXiv:1605.09096.](https://arxiv.org/pdf/1605.09096.pdf)", "HistWords: [William L. Hamilton, Jure Leskovec, and Dan Jurafsky. 
HistWords: Word Embeddings for Historical Text](https://nlp.stanford.edu/projects/histwords/)", @@ -104,16 +108,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on token sets - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "generate-embeddings" - def process(self): # parse parameters input_words = self.parameters.get("words", "") diff --git a/processors/visualisation/image_category_wall.py b/processors/visualisation/image_category_wall.py index 95acc9627..7dd9df5c2 100644 --- a/processors/visualisation/image_category_wall.py +++ b/processors/visualisation/image_category_wall.py @@ -17,6 +17,7 @@ from common.lib.helpers import UserInput, convert_to_int, get_4cat_canvas from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl", "Stijn Peeters"] @@ -36,6 +37,9 @@ class ImageCategoryWallGenerator(BasicProcessor): description = "Combine images into a single image arranged by category" # description displayed in UI extension = "svg" # extension of result file, used internally and in UI + # image-category, image-downloader, or video-hash datasets (except screenshot downloads) + compatibility = Compatibility(type_prefixes={"image-to-categories", "image-downloader", "video-hasher-1", "video-hash-similarity-matrix"}, excluded_types={"image-downloader-screenshots-search"}) + number_of_ranges = 10 # number of ranges to use for numeric categories image_datasets = ["image-downloader", "video-hasher-1"] @@ -57,21 +61,6 @@ class ImageCategoryWallGenerator(BasicProcessor): } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on CLIP 
dataset only - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return ( - module.type.startswith("image-to-categories") - or module.type.startswith("image-downloader") - or module.type.startswith("video-hasher-1") - or module.type.startswith("video-hash-similarity-matrix") - ) and module.type not in ["image-downloader-screenshots-search"] - @classmethod def get_options(cls, parent_dataset=None, config=None): """ diff --git a/processors/visualisation/image_wall.py b/processors/visualisation/image_wall.py index 2438dac61..ebbb25f2b 100644 --- a/processors/visualisation/image_wall.py +++ b/processors/visualisation/image_wall.py @@ -4,8 +4,8 @@ from PIL import Image, ImageOps, UnidentifiedImageError from sklearn.cluster import KMeans from common.lib.helpers import UserInput +from common.lib.compatibility import Compatibility, ExecutableSibling import colorsys -import shutil import copy from processors.visualisation.video_wall import VideoWallGenerator @@ -30,27 +30,8 @@ class ImageWallGenerator(VideoWallGenerator): description = "Put all images in a single combined image, side by side. Images can be sorted and resized." 
extension = "png" - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Check compatibility - - This processor can run 1) if ffmpeg is available and 2) if the source - is an image or video dataset - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - ffmpeg_path = shutil.which(config.get("video-downloader.ffmpeg_path")) - ffprobe_path = ( - shutil.which("ffprobe".join(ffmpeg_path.rsplit("ffmpeg", 1))) - if ffmpeg_path - else None - ) - have_ffmpeg = (ffmpeg_path and ffprobe_path) - return have_ffmpeg and (module.get_media_type() in ("video", "image") - or module.type.startswith("image-downloader") - or module.type == "video-frames") + # Allow on image/video datasets when ffmpeg and ffprobe are available + compatibility = Compatibility(media_types={"video", "image"}, type_prefixes={"image-downloader"}, types={"video-frames"}, required_settings={("video-downloader.ffmpeg_path", ExecutableSibling("ffmpeg", "ffprobe"))}) @classmethod def get_options(cls, parent_dataset=None, config=None): diff --git a/processors/visualisation/image_wall_w_text.py b/processors/visualisation/image_wall_w_text.py index cdc096323..71b4f753e 100644 --- a/processors/visualisation/image_wall_w_text.py +++ b/processors/visualisation/image_wall_w_text.py @@ -18,6 +18,7 @@ from common.lib.helpers import UserInput, convert_to_int, get_4cat_canvas from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException +from common.lib.compatibility import Compatibility __author__ = "Dale Wahl" __credits__ = ["Dale Wahl", "Stijn Peeters"] @@ -39,6 +40,10 @@ class ImageTextWallGenerator(BasicProcessor): caption_datasets = ["image-captions", "text-from-images"] combined_dataset = ["image-downloader-stable-diffusion"] + # coarse map spec; is_compatible_with (below) is the runtime truth -- it walks the + # genealogy 
(identity_dataset_types) to confirm both an image and a text/caption dataset + compatibility = Compatibility(types=set(combined_dataset), type_prefixes=set(caption_datasets)) + @classmethod def is_compatible_with(cls, module=None, config=None): """ diff --git a/processors/visualisation/isoviz.py b/processors/visualisation/isoviz.py index 697e21bfc..a1199abda 100644 --- a/processors/visualisation/isoviz.py +++ b/processors/visualisation/isoviz.py @@ -6,6 +6,7 @@ from backend.lib.processor import BasicProcessor from common.lib.helpers import UserInput, convert_to_int, pad_interval, get_4cat_canvas +from common.lib.compatibility import Compatibility from calendar import month_abbr from math import sin, cos, tan, degrees, radians, copysign @@ -35,6 +36,9 @@ class IsometricMultigraphRenderer(BasicProcessor): description = "Generate area graphs showing prevalence per item over time. These are visualised side-by-side on an isometric plane for easy comparison." # description displayed in UI extension = "svg" # extension of result file, used internally and in UI + # rankable datasets with a single value per item (multiple_items=False) + compatibility = Compatibility(rankable=True, rankable_multiple_items=False) + # a palette generated with https://medialab.github.io/iwanthue/ colours = ["#eb010a", "#495dff", "#f35f00", "#5137e0", "#ffeb45", "#d05edf", "#00cb3a", "#b200c7", "#d8fd5d", "#a058ff", "#b90fd4", "#6fb300", @@ -85,18 +89,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on rankable items - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - if module.is_dataset(): - return module.is_rankable(multiple_items=False) - return False - def process(self): graphs = {} intervals = [] diff --git a/processors/visualisation/rankflow.py 
b/processors/visualisation/rankflow.py index a8ad588ab..99397321d 100644 --- a/processors/visualisation/rankflow.py +++ b/processors/visualisation/rankflow.py @@ -7,6 +7,7 @@ from backend.lib.processor import BasicProcessor from common.lib.helpers import UserInput, get_4cat_canvas from common.lib.exceptions import ProcessorInterruptedException +from common.lib.compatibility import Compatibility from svgwrite.shapes import Rect from svgwrite.path import Path @@ -41,6 +42,9 @@ class RankFlowRenderer(BasicProcessor): ) # description displayed in UI extension = "svg" # extension of result file, used internally and in UI + # rankable datasets, including multi-column rankings (e.g. top vectors per interval) + compatibility = Compatibility(rankable=True) + references = [ "[Rieder, B. RankFlow. *The Politics of Systems*](https://labs.polsys.net/tools/rankflow/)" ] @@ -133,16 +137,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on rankable items - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_rankable() - def process(self): """ Render RankFlow diagram diff --git a/processors/visualisation/vector_histogram.py b/processors/visualisation/vector_histogram.py index 725960e08..e01f2ba22 100644 --- a/processors/visualisation/vector_histogram.py +++ b/processors/visualisation/vector_histogram.py @@ -12,6 +12,7 @@ from backend.lib.processor import BasicProcessor from common.lib.helpers import UserInput, pad_interval, get_4cat_canvas +from common.lib.compatibility import Compatibility __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters"] @@ -28,6 +29,9 @@ class SVGHistogramRenderer(BasicProcessor): description = "Generates a histogram from time frequencies." 
# description displayed in UI extension = "svg" + # rankable datasets with a single value per item (multiple_items=False) + compatibility = Compatibility(rankable=True, rankable_multiple_items=False) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -48,16 +52,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on rankable items - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.is_rankable(multiple_items=False) - def process(self): """ Render an SVG histogram/bar chart using a previous frequency analysis diff --git a/processors/visualisation/video_frames.py b/processors/visualisation/video_frames.py index 3ce17dcca..9a24621fa 100644 --- a/processors/visualisation/video_frames.py +++ b/processors/visualisation/video_frames.py @@ -8,6 +8,7 @@ import oslex from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility, is_executable from common.lib.exceptions import ProcessorInterruptedException from common.lib.user_input import UserInput from processors.visualisation.download_videos import VideoDownloaderPlus @@ -30,7 +31,8 @@ class VideoFrames(BasicProcessor): description = "Extract frames from videos" # description displayed in UI extension = "zip" # extension of result file, used internally and in UI - followups = ["video-timelines"] + VideoDownloaderPlus.followups + # Allow on video datasets when ffmpeg is available + compatibility = Compatibility(media_types={"video"}, type_prefixes={"video-downloader"}, required_settings={("video-downloader.ffmpeg_path", is_executable)}, preferred_followups=["video-timelines"] + VideoDownloaderPlus.followups) @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -67,17 +69,6 @@ def get_options(cls, 
parent_dataset=None, config=None) -> dict: }, } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on videos - - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return (module.get_media_type() == "video" or module.type.startswith("video-downloader")) and \ - config.get("video-downloader.ffmpeg_path") and \ - shutil.which(config.get("video-downloader.ffmpeg_path")) - def process(self): """ This takes a zipped set of videos, uses https://pypi.org/project/videohash/ and https://ffmpeg.org/ to collect diff --git a/processors/visualisation/video_hasher.py b/processors/visualisation/video_hasher.py index ae4a84aa6..8271e7b69 100644 --- a/processors/visualisation/video_hasher.py +++ b/processors/visualisation/video_hasher.py @@ -16,6 +16,7 @@ from backend.lib.processor import BasicProcessor from backend.lib.preset import ProcessorAdvancedPreset +from common.lib.compatibility import Compatibility, is_executable from common.lib.exceptions import ProcessorInterruptedException, ProcessorException from common.lib.user_input import UserInput @@ -35,6 +36,9 @@ class VideoHasherPreset(ProcessorAdvancedPreset): description = "Creates video hashes (64 bits/identifiers) to identify near duplicate videos in a dataset based on hash similarity. Uses video only. This process can take a long time depending on video length, amount, and frames per second." extension = "gexf" + # video datasets, when ffmpeg is available + compatibility = Compatibility(media_types={"video"}, type_prefixes={"video-downloader"}, required_settings={("video-downloader.ffmpeg_path", is_executable)}) + @classmethod def get_options(cls, parent_dataset=None, config=None): return { @@ -69,21 +73,6 @@ def get_options(cls, parent_dataset=None, config=None): } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine compatibility - - Compatible with downloaded videos, and not really anything else! 
- Additionally ffmpeg needs to be available. - - :param DataSet module: Module ID to determine compatibility with - :return bool: - """ - return (module.get_media_type() == "video" or module.type.startswith("video-downloader")) and \ - config.get("video-downloader.ffmpeg_path") and \ - shutil.which(config.get("video-downloader.ffmpeg_path")) - def get_processor_advanced_pipeline(self, attach_to=None): """ This queues a series of post-processors to visualise videos. @@ -145,7 +134,8 @@ class VideoHasher(BasicProcessor): extension = "zip" # extension of result file, used internally and in UI media_type = "image" # media type of the result - followups = ["video-hash-network", "video-hash-similarity-matrix"] + # video datasets (collages are made from video frames) + compatibility = Compatibility(media_types={"video"}, type_prefixes={"video-downloader"}, preferred_followups=["video-hash-network", "video-hash-similarity-matrix"]) @classmethod def get_options(cls, parent_dataset=None, config=None): @@ -174,13 +164,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on videos only - """ - return module.get_media_type() == "video" or module.type.startswith("video-downloader") - def process(self): """ This takes a zipped set of videos, uses https://pypi.org/project/videohash/ and https://ffmpeg.org/ to collect @@ -356,6 +339,9 @@ class VideoHashNetwork(BasicProcessor): description = "Creates hashes network to identify duplicate or similar videos." 
# description displayed in UI extension = "gexf" # extension of result file, used internally and in UI + # Allow on video hasher + compatibility = Compatibility(types={"video-hasher-1"}) + references = [ "[Video Hash](https://github.com/akamhy/videohash#readme)", ] @@ -370,13 +356,6 @@ def get_options(cls, parent_dataset=None, config=None): "max": 100 }} - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on video hasher - """ - return module.type in ["video-hasher-1"] - def process(self): """ @@ -474,6 +453,9 @@ class VideoHashSimilarities(BasicProcessor): description = "Creates CSV with hashes and groups videos above similarity value." # description displayed in UI extension = "csv" # extension of result file, used internally and in UI + # Allow on video hasher + compatibility = Compatibility(types={"video-hasher-1"}) + references = [ "[Video Hash](https://github.com/akamhy/videohash#readme)", ] @@ -488,13 +470,6 @@ def get_options(cls, parent_dataset=None, config=None): "max": 100 }} - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on video hasher - """ - return module.type in ["video-hasher-1"] - def process(self): """ diff --git a/processors/visualisation/video_scene_frames.py b/processors/visualisation/video_scene_frames.py index 7439e4da6..65e136fee 100644 --- a/processors/visualisation/video_scene_frames.py +++ b/processors/visualisation/video_scene_frames.py @@ -12,6 +12,7 @@ from packaging import version from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility, is_executable from common.lib.user_input import UserInput from common.lib.helpers import get_ffmpeg_version @@ -33,7 +34,8 @@ class VideoSceneFrames(BasicProcessor): description = "For each scene identified, extracts a key frame (e.g. the first frame)." 
# description displayed in UI extension = "zip" # extension of result file, used internally and in UI - followups = ["video-timelines"] + # Allow on detected video scenes when ffmpeg is available + compatibility = Compatibility(types={"video-scene-detector"}, required_settings={("video-downloader.ffmpeg_path", is_executable)}, preferred_followups=["video-timelines"]) @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -72,20 +74,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine compatibility - - Compatible with scene data only. - - :param str module: Module ID to determine compatibility with - :return bool: - """ - return module.type in ["video-scene-detector"] and \ - config.get("video-downloader.ffmpeg_path") and \ - shutil.which(config.get("video-downloader.ffmpeg_path")) - def process(self): """ This takes a zipped set of videos, uses https://pypi.org/project/videohash/ and https://ffmpeg.org/ to collect diff --git a/processors/visualisation/video_scene_identifier.py b/processors/visualisation/video_scene_identifier.py index c7731aca4..7fa6c03b8 100644 --- a/processors/visualisation/video_scene_identifier.py +++ b/processors/visualisation/video_scene_identifier.py @@ -10,6 +10,7 @@ from scenedetect import open_video, SceneManager, VideoOpenFailure, FrameTimecode from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException, ProcessorException from common.lib.user_input import UserInput @@ -39,7 +40,8 @@ class VideoSceneDetector(BasicProcessor): "intensity or cuts and fades to black) and extract the scene metadata." 
# description displayed in UI extension = "csv" # extension of result file, used internally and in UI - followups = ["video-scene-frames", "video-timelines"] + # Allow on video datasets + compatibility = Compatibility(media_types={"video"}, type_prefixes={"video-downloader"}, preferred_followups=["video-scene-frames", "video-timelines"]) references = [ "[PySceneDetect](https://github.com/Breakthrough/PySceneDetect)", @@ -205,13 +207,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow on videos - """ - return module.get_media_type() == "video" or module.type.startswith("video-downloader") - def process(self): """ This takes a zipped set of videos, uses https://github.com/Breakthrough/PySceneDetect to detect scene breaks in diff --git a/processors/visualisation/video_stack.py b/processors/visualisation/video_stack.py index 390d9f7f3..a2f2fcac5 100644 --- a/processors/visualisation/video_stack.py +++ b/processors/visualisation/video_stack.py @@ -12,6 +12,7 @@ from packaging import version from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility, ExecutableSibling from common.lib.exceptions import ProcessorInterruptedException from common.lib.user_input import UserInput from common.lib.helpers import get_ffmpeg_version @@ -36,6 +37,9 @@ class VideoStack(BasicProcessor): "videos. Videos are stacked by length, i.e. the longest video is at the 'bottom' of the stack." 
# description displayed in UI extension = "mp4" # extension of result file, used internally and in UI + # Allow on video datasets when ffmpeg and ffprobe are available + compatibility = Compatibility(media_types={"video"}, type_prefixes={"video-downloader"}, required_settings={("video-downloader.ffmpeg_path", ExecutableSibling("ffmpeg", "ffprobe"))}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -98,25 +102,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine compatibility - - :param DataSet module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - if not (module.get_media_type() == "video" or module.type.startswith("video-downloader")): - return False - else: - # Only check these if we have a video dataset - # also need ffprobe to determine video lengths - # is usually installed in same place as ffmpeg - ffmpeg_path = shutil.which(config.get("video-downloader.ffmpeg_path")) - ffprobe_path = shutil.which("ffprobe".join(ffmpeg_path.rsplit("ffmpeg", 1))) if ffmpeg_path else None - return ffmpeg_path and ffprobe_path - def process(self): """ This takes a zipped set of videos, uses https://pypi.org/project/videohash/ and https://ffmpeg.org/ to collect diff --git a/processors/visualisation/video_timelines.py b/processors/visualisation/video_timelines.py index a57ff2ea7..e2036c4bc 100644 --- a/processors/visualisation/video_timelines.py +++ b/processors/visualisation/video_timelines.py @@ -14,6 +14,7 @@ from ural import is_url from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException from common.lib.user_input import UserInput from common.lib.helpers import get_4cat_canvas @@ -38,6 +39,10 @@ class VideoTimelines(BasicProcessor):
"collage of sequential frames). Timelines are then vertically stacked." # description displayed in UI extension = "svg" # extension of result file, used internally and in UI + # Compatible with extracted video frames. Could in principle run on anything + # that stores related images in separate folders within a zip archive. + compatibility = Compatibility(types={"video-frames", "video-scene-frames"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -62,21 +67,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine compatibility - - Compatible with 'Extract video frames'. Can in principle run on - anything that stores related images in separate folders in a zip - archive. Each folder will be rendered as a separate timeline. - - :param str module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - return module.type in ["video-frames", "video-scene-frames"] - def process(self): metadata = {} base_height = self.parameters.get("height", 100) diff --git a/processors/visualisation/video_wall.py b/processors/visualisation/video_wall.py index fb4f54946..74800834c 100644 --- a/processors/visualisation/video_wall.py +++ b/processors/visualisation/video_wall.py @@ -12,6 +12,7 @@ from common.lib.helpers import UserInput, get_ffmpeg_version, convert_to_int from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility, ExecutableSibling from common.lib.exceptions import ProcessorInterruptedException, MediaSignatureException __author__ = "Stijn Peeters" @@ -34,6 +35,9 @@ class VideoWallGenerator(BasicProcessor): description = "Put all videos in a single combined video, side by side. Videos can be sorted and resized."
extension = "mp4" # extension of result file, used internally and in UI + # Allow on video datasets when ffmpeg and ffprobe are available + compatibility = Compatibility(media_types={"video"}, type_prefixes={"video-downloader"}, required_settings={("video-downloader.ffmpeg_path", ExecutableSibling("ffmpeg", "ffprobe"))}) + # videos will be arranged and resized to fit these image wall dimensions # note that video aspect ratio may not allow for a precise fit TARGET_DIMENSIONS = { @@ -128,25 +132,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Determine compatibility - - :param DataSet module: Module ID to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - :return bool: - """ - if not (module.get_media_type() == "video" or module.type.startswith("video-downloader")): - return False - else: - # Only check these if we have a video dataset - # also need ffprobe to determine video lengths - # is usually installed in same place as ffmpeg - ffmpeg_path = shutil.which(config.get("video-downloader.ffmpeg_path")) - ffprobe_path = shutil.which("ffprobe".join(ffmpeg_path.rsplit("ffmpeg", 1))) if ffmpeg_path else None - return ffmpeg_path and ffprobe_path - def process(self): """ Go through media files, determine dimensions, sort according to the diff --git a/processors/visualisation/word-cloud.py b/processors/visualisation/word-cloud.py index eea5b5aaa..be922a0a5 100644 --- a/processors/visualisation/word-cloud.py +++ b/processors/visualisation/word-cloud.py @@ -6,6 +6,7 @@ from wordcloud import WordCloud from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput __author__ = "Sal Hagen" @@ -24,15 +25,11 @@ class MakeWordCloud(BasicProcessor): description = "Generates a word cloud with words sized on occurrence." 
# description displayed in UI extension = "svg" - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on rankable items - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type in ("tfidf", "collocations", "vector-ranker", "vectorise-tokens-by-category", "similar-word2vec", "extract-nouns", "get-entities") + # Allow processor on rankable items + compatibility = Compatibility(types={ + "tfidf", "collocations", "vector-ranker", "vectorise-tokens-by-category", + "similar-word2vec", "extract-nouns", "get-entities" + }) @classmethod def get_options(cls, parent_dataset=None, config=None): diff --git a/processors/visualisation/word-trees.py b/processors/visualisation/word-trees.py index e64e96410..0bbaf602a 100644 --- a/processors/visualisation/word-trees.py +++ b/processors/visualisation/word-trees.py @@ -8,6 +8,7 @@ from backend.lib.processor import BasicProcessor from common.lib.helpers import UserInput, convert_to_int, get_4cat_canvas from common.lib.exceptions import QueryParametersException +from common.lib.compatibility import Compatibility from nltk.tokenize import word_tokenize @@ -168,6 +169,9 @@ class MakeWordtree(BasicProcessor): description = "Generates a word tree for a given query, a \"graphical version of the traditional 'keyword-in-context' method\" (Wattenberg & Viégas, 2008)." # description displayed in UI extension = "svg" # extension of result file, used internally and in UI + # any csv or ndjson dataset + compatibility = Compatibility(extensions={"csv", "ndjson"}) + references = [ "Wattenberg, M., & Viégas, F. B. (2008). [The Word Tree, an Interactive Visual Concordance](https://doi.org/10.1109/TVCG.2008.172). IEEE Transactions on Visualization and Computer Graphics, 14(6), 1221–1228." 
] @@ -307,16 +311,6 @@ def get_options(cls, parent_dataset=None, config=None): return options - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor to run on all csv and NDJSON datasets - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.get_extension() in ("csv", "ndjson") - def process(self): """ This takes a 4CAT results file as input, and outputs a plain text file diff --git a/processors/visualisation/youtube_imagewall.py b/processors/visualisation/youtube_imagewall.py index 96285aa08..94e9c969d 100644 --- a/processors/visualisation/youtube_imagewall.py +++ b/processors/visualisation/youtube_imagewall.py @@ -10,6 +10,7 @@ from PIL import Image, ImageOps, ImageDraw, ImageFont from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.helpers import UserInput, convert_to_int __author__ = "Sal Hagen" @@ -32,6 +33,9 @@ class YouTubeImageWall(BasicProcessor): description = "Make an image wall from YouTube video thumbnails." 
# description displayed in UI extension = "png" # extension of result file, used internally and in UI + # Allow processor on YouTube thumbnail sets + compatibility = Compatibility(types={"youtube-thumbnails"}) + @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: """ @@ -56,16 +60,6 @@ def get_options(cls, parent_dataset=None, config=None) -> dict: } } - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on YouTube thumbnail sets - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "youtube-thumbnails" - def process(self): """ Takes the thumbnails downloaded from YouTube metadata and diff --git a/processors/visualisation/youtube_thumbnails.py b/processors/visualisation/youtube_thumbnails.py index e5feb4249..111cd6a43 100644 --- a/processors/visualisation/youtube_thumbnails.py +++ b/processors/visualisation/youtube_thumbnails.py @@ -7,6 +7,7 @@ from apiclient.discovery import build from backend.lib.processor import BasicProcessor +from common.lib.compatibility import Compatibility from common.lib.exceptions import ProcessorInterruptedException from common.lib.helpers import get_yt_compatible_ids, UserInput @@ -18,7 +19,7 @@ class YouTubeThumbnails(BasicProcessor): """ - + Downloads YouTube thumbnails. 
""" @@ -30,21 +31,12 @@ class YouTubeThumbnails(BasicProcessor): extension = "zip" # extension of result file, used internally and in UI media_type = "image" # media type of the result - followups = ["youtube-imagewall"] + # Allow processor on YouTube metadata sets + compatibility = Compatibility(types={"youtube-metadata"}, preferred_followups=["youtube-imagewall"]) max_retries = 3 sleep_time = 10 - @classmethod - def is_compatible_with(cls, module=None, config=None): - """ - Allow processor on YouTube metadata sets - - :param module: Dataset or processor to determine compatibility with - :param ConfigManager|None config: Configuration reader (context-aware) - """ - return module.type == "youtube-metadata" - @classmethod def get_options(cls, parent_dataset=None, config=None) -> dict: @@ -86,7 +78,7 @@ def download_thumbnails(self, video_ids): # prepare staging area results_path = self.dataset.get_staging_area() - + api_key = self.parameters.get("key") if not api_key: api_key = self.config.get("api.youtube.key") @@ -94,7 +86,7 @@ def download_thumbnails(self, video_ids): self.dataset.finish_with_error("You need to provide a valid API key") return self.api_key = api_key - + # Use YouTubeDL and the YouTube API to request video data youtube = build("youtube", "v3", developerKey=api_key) diff --git a/webtool/views/api_standalone.py b/webtool/views/api_standalone.py index 796562fc6..0362171ac 100644 --- a/webtool/views/api_standalone.py +++ b/webtool/views/api_standalone.py @@ -137,7 +137,7 @@ def is_rankable(self, multiple_items=False): continue # Check if the processor is compatible with the fake dataset - if hasattr(processor, "is_compatible_with") and not processor.is_compatible_with(fake_dataset): + if not processor.is_compatible_with(fake_dataset, g.config): continue available_processors[processor_type] = processor diff --git a/webtool/views/api_tool.py b/webtool/views/api_tool.py index b4068d361..d1488d997 100644 --- a/webtool/views/api_tool.py +++ 
b/webtool/views/api_tool.py @@ -209,7 +209,7 @@ def get_processor_options(processor_type, dataset_id=None): return error(404, message="Dataset '%s' does not exist" % dataset_id) # Check compatibility of processor with dataset - if hasattr(processor, "is_compatible_with") and not processor.is_compatible_with(dataset, g.config): + if not processor.is_compatible_with(dataset, g.config): return error(422, message="Processor '%s' is not compatible with dataset '%s'" % (processor_type, dataset_id)) worker_options = processor.get_options(dataset, g.config)