Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions otdf-local/src/otdf_local/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,14 +94,27 @@ def up(
bool,
typer.Option("--no-provision", help="Skip provisioning step"),
] = False,
tracing: Annotated[
bool,
typer.Option(
"--tracing",
help="Start Jaeger and export OpenTelemetry traces from platform/KAS",
),
] = False,
) -> None:
"""Start the test environment.

By default starts all services: docker (keycloak, postgres), platform, and all KAS instances.
"""
settings = get_settings()
# Enable tracing on the shared settings so every service (docker profile,
# platform/KAS config generation) picks it up.
settings.tracing = tracing
settings.ensure_directories()

if tracing:
print_info(f"Tracing enabled — Jaeger UI at {settings.jaeger_ui_url}")

# Parse services to start
if services:
service_list = [s.strip().lower() for s in services.split(",")]
Expand Down Expand Up @@ -591,6 +604,16 @@ def env(
root_key = get_nested(platform_config, "services.kas.root_key")
if root_key:
env_vars["OT_ROOT_KEY"] = root_key

# If the running platform is exporting traces, point clients (pytest,
# SDK CLIs) at the same collector so their spans join the trace.
if get_nested(platform_config, "server.trace.enabled"):
endpoint = get_nested(
platform_config,
"server.trace.provider.otlp.endpoint",
settings.otlp_endpoint,
)
env_vars["OTEL_EXPORTER_OTLP_ENDPOINT"] = endpoint
except Exception as e:
print_warning(f"Could not read root key from platform config: {e}")

Expand Down
20 changes: 20 additions & 0 deletions otdf-local/src/otdf_local/config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from functools import lru_cache
from pathlib import Path
from typing import Any

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
Expand Down Expand Up @@ -146,6 +147,25 @@ def docker_compose_file(self) -> Path:
# Log level
log_level: str = "info"

# OpenTelemetry tracing (opt-in via `otdf-local up --tracing`)
tracing: bool = False
otlp_endpoint: str = "localhost:4317"
jaeger_ui_url: str = "http://localhost:16686"

def trace_config_updates(self) -> dict[str, Any]:
    """Return the platform/KAS config overrides that turn on OTLP trace export.

    Returns:
        A mapping of dotted config keys to values. The mapping is empty when
        ``self.tracing`` is false, so merging it into a set of config updates
        leaves the generated config untouched for normal runs.
    """
    overrides: dict[str, Any] = {}
    if self.tracing:
        overrides["server.trace.enabled"] = True
        overrides["server.trace.provider.name"] = "otlp"
        # The collector address comes from settings rather than being fixed here.
        overrides["server.trace.provider.otlp.endpoint"] = self.otlp_endpoint
        overrides["server.trace.provider.otlp.insecure"] = True
    return overrides

def get_kas_port(self, name: str) -> int:
"""Get port for a KAS instance."""
return Ports.get_kas_port(name)
Expand Down
13 changes: 11 additions & 2 deletions otdf-local/src/otdf_local/services/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,22 @@ def service_type(self) -> ServiceType:
def health_url(self) -> str:
return f"http://localhost:{Ports.KEYCLOAK}/auth/realms/master"

def _compose_cmd(self, *args: str) -> list[str]:
"""Build a `docker compose` command, opting into the `tracing` profile
(which includes the Jaeger all-in-one container) when tracing is enabled.
"""
cmd = ["docker", "compose", "-f", str(self._compose_file)]
if self.settings.tracing:
cmd += ["--profile", "tracing"]
return cmd + list(args)
Comment on lines +35 to +42

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

When running otdf-local down, the CLI runs in a new process where self.settings.tracing is False (since --tracing is only an option on the up command). Consequently, _compose_cmd will not include the tracing profile, and the Jaeger container will be left running (orphaned) after down. We should ensure the tracing profile is included when stopping services (i.e., when "down" is in the arguments).

Suggested change
def _compose_cmd(self, *args: str) -> list[str]:
"""Build a `docker compose` command, opting into the `tracing` profile
(which includes the Jaeger all-in-one container) when tracing is enabled.
"""
cmd = ["docker", "compose", "-f", str(self._compose_file)]
if self.settings.tracing:
cmd += ["--profile", "tracing"]
return cmd + list(args)
def _compose_cmd(self, *args: str) -> list[str]:
"""Build a `docker compose` command, opting into the `tracing` profile
(which includes the Jaeger all-in-one container) when tracing is enabled.
"""
cmd = ["docker", "compose", "-f", str(self._compose_file)]
if self.settings.tracing or "down" in args:
cmd += ["--profile", "tracing"]
return cmd + list(args)


def start(self) -> bool:
"""Start Docker compose services."""
if not self._compose_file.exists():
return False

result = subprocess.run(
["docker", "compose", "-f", str(self._compose_file), "up", "-d"],
self._compose_cmd("up", "-d"),
capture_output=True,
text=True,
cwd=self._compose_file.parent,
Expand All @@ -51,7 +60,7 @@ def stop(self) -> bool:
return False

result = subprocess.run(
["docker", "compose", "-f", str(self._compose_file), "down"],
self._compose_cmd("down"),
capture_output=True,
text=True,
cwd=self._compose_file.parent,
Expand Down
3 changes: 3 additions & 0 deletions otdf-local/src/otdf_local/services/kas.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ def _generate_config(self) -> Path:
# registered_kas_uri should NOT have /kas suffix
updates["services.kas.registered_kas_uri"] = f"http://localhost:{self.port}"

# Export traces to the local collector (Jaeger) when tracing is enabled
updates.update(self.settings.trace_config_updates())

copy_yaml_with_updates(template_path, config_path, updates)
return config_path

Expand Down
3 changes: 3 additions & 0 deletions otdf-local/src/otdf_local/services/platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ def _generate_config(self) -> Path:
"logger.output": logger_output,
}

# Export traces to the local collector (Jaeger) when tracing is enabled
updates.update(self.settings.trace_config_updates())

copy_yaml_with_updates(template_path, config_path, updates)

# Set up golden keys for legacy TDF tests
Expand Down
7 changes: 7 additions & 0 deletions xtest/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def pytest_report_header() -> list[str]:
"fixtures.keys",
"fixtures.audit",
"fixtures.encryption",
"fixtures.tracing",
]


Expand Down Expand Up @@ -129,6 +130,12 @@ def pytest_addoption(parser: pytest.Parser):
action="store_true",
help="skip round-trip tests where all SDKs are released artifacts",
)
parser.addoption(
"--tracing",
action="store_true",
help="wrap each test in an OpenTelemetry span and export it to the OTLP "
"collector (also enabled by setting OTEL_EXPORTER_OTLP_ENDPOINT)",
)
parser.addoption(
"--sdks",
help=f"select which sdks to run by default, unless overridden; one or more of {englist(typing.get_args(tdfs.sdk_type))}, optionally version-qualified (e.g. go@main, go@v0.18.0, go@*)",
Expand Down
142 changes: 142 additions & 0 deletions xtest/fixtures/tracing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
"""Pytest fixtures for end-to-end OpenTelemetry tracing.

Wraps each test in a ``pytest.test`` span and exports a ``TRACEPARENT`` into the
environment so the SDK CLI subprocess (and, through it, platform/KAS) join the
same trace. On failure the trace's Jaeger URL is printed so the failure links
directly to the full request chain.

Tracing is opt-in and a strict no-op unless enabled — it activates when either
``--tracing`` is passed or ``OTEL_EXPORTER_OTLP_ENDPOINT`` is set. When disabled
nothing is imported or initialized, so normal runs pay no cost.

See ``fixtures/audit.py`` for the sibling log-collection fixture this mirrors.
"""

import logging
import os
from collections.abc import Iterator
from dataclasses import dataclass
Comment on lines +15 to +18

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Import Any from typing to allow proper typing of the TracingSession dataclass fields without requiring module-level imports of OpenTelemetry packages.

Suggested change
import logging
import os
from collections.abc import Iterator
from dataclasses import dataclass
import logging
import os
from collections.abc import Iterator
from dataclasses import dataclass
from typing import Any


import pytest

# Logger for the tracing fixtures, registered under the "xtest" name.
logger = logging.getLogger("xtest")

# Default local collector (Jaeger all-in-one, OTLP gRPC) and UI, matching the
# `tracing` docker-compose profile started by `otdf-local up --tracing`.
# The endpoint is the fallback when OTEL_EXPORTER_OTLP_ENDPOINT is unset.
_DEFAULT_OTLP_ENDPOINT = "localhost:4317"
# The UI base URL is the fallback when JAEGER_UI_URL is unset.
_DEFAULT_JAEGER_UI = "http://localhost:16686"


@dataclass
class TracingSession:
"""Holds the initialized tracer and the Jaeger UI base URL for a session."""

tracer: object # opentelemetry.trace.Tracer
provider: object # opentelemetry.sdk.trace.TracerProvider
Comment on lines +34 to +35

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Typing tracer and provider as object requires using # type: ignore[attr-defined] when calling methods on them. Typing them as Any avoids type-checking errors and allows removing the type ignores.

Suggested change
tracer: object # opentelemetry.trace.Tracer
provider: object # opentelemetry.sdk.trace.TracerProvider
tracer: Any # opentelemetry.trace.Tracer
provider: Any # opentelemetry.sdk.trace.TracerProvider

jaeger_ui_url: str


@pytest.fixture(scope="session")
def _tracing(request: pytest.FixtureRequest) -> Iterator[TracingSession | None]:
    """Provide the session-wide tracing state, or ``None`` when tracing is off.

    Tracing is switched on by the ``--tracing`` flag or by a non-empty
    ``OTEL_EXPORTER_OTLP_ENDPOINT``. While it is off, no OpenTelemetry module
    is imported and nothing is initialized.

    Yields:
        A ``TracingSession`` holding the tracer, its provider and the Jaeger UI
        base URL, or ``None`` when tracing is disabled.
    """
    env_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
    flag_given = bool(request.config.getoption("--tracing", default=False))
    if not flag_given and not env_endpoint:
        yield None
        return

    # `--tracing` without an explicit endpoint targets the default collector.
    target = env_endpoint if env_endpoint else _DEFAULT_OTLP_ENDPOINT

    # Deferred imports keep the disabled path free of any OTel dependency.
    from opentelemetry import trace
    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
    from opentelemetry.sdk.resources import Resource
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import BatchSpanProcessor

    resource = Resource.create({"service.name": "xtest"})
    provider = TracerProvider(resource=resource)
    exporter = OTLPSpanExporter(endpoint=target, insecure=True)
    provider.add_span_processor(BatchSpanProcessor(exporter))
    trace.set_tracer_provider(provider)
    logger.info("xtest tracing enabled, exporting to %s", target)

    tracer = provider.get_tracer("xtest")
    ui_base = os.getenv("JAEGER_UI_URL", _DEFAULT_JAEGER_UI)
    session = TracingSession(tracer=tracer, provider=provider, jaeger_ui_url=ui_base)
    try:
        yield session
    finally:
        # Shutting the provider down flushes buffered spans before exit.
        provider.shutdown()


def _test_span_attributes(request: pytest.FixtureRequest) -> dict[str, str]:
"""Derive span attributes (sdk, container) from the test's parametrization."""
attrs: dict[str, str] = {"test.name": request.node.name}
callspec = getattr(request.node, "callspec", None)
if callspec is None:
return attrs
params = callspec.params
for key in ("encrypt_sdk", "decrypt_sdk", "sdk"):
if key in params:
attrs["test.sdk"] = str(params[key])
break
if "container" in params:
attrs["test.container"] = str(params["container"])
return attrs


@pytest.fixture(autouse=True)
def _trace_test(
request: pytest.FixtureRequest, _tracing: TracingSession | None
) -> Iterator[None]:
"""Wrap each test in a ``pytest.test`` span and propagate it to subprocesses.

Sets ``TRACEPARENT`` in the environment for the duration of the test so the
SDK CLI (invoked via ``subprocess.run`` in ``tdfs.py``, which copies
``os.environ``) starts a child span under this one. Prints the Jaeger trace
URL on failure.
"""
if _tracing is None:
yield
return

from opentelemetry import trace
from opentelemetry.propagate import inject

tracer = _tracing.tracer
with tracer.start_as_current_span("pytest.test") as span: # type: ignore[attr-defined]

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

With tracer typed as Any, the # type: ignore[attr-defined] is no longer necessary and can be safely removed.

Suggested change
with tracer.start_as_current_span("pytest.test") as span: # type: ignore[attr-defined]
with tracer.start_as_current_span("pytest.test") as span:

for key, value in _test_span_attributes(request).items():
span.set_attribute(key, value)

# Export the active context so child processes continue this trace.
carrier: dict[str, str] = {}
inject(carrier)
prev_traceparent = os.environ.get("TRACEPARENT")
if "traceparent" in carrier:
os.environ["TRACEPARENT"] = carrier["traceparent"]

trace_id = format(span.get_span_context().trace_id, "032x")
trace_url = f"{_tracing.jaeger_ui_url}/trace/{trace_id}"
try:
yield
finally:
# Restore prior TRACEPARENT to avoid leaking across tests.
if prev_traceparent is None:
os.environ.pop("TRACEPARENT", None)
else:
os.environ["TRACEPARENT"] = prev_traceparent

rep = getattr(request.node, "rep_call", None)
if rep is not None and rep.failed:
span.set_status(trace.Status(trace.StatusCode.ERROR))
print(f"\nTrace: {trace_url}")
Comment on lines +139 to +142

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Currently, only rep_call is checked to determine if the test failed. However, integration tests can also fail during the setup or teardown phases (e.g., if a fixture fails to initialize or clean up). We should check all three phases (setup, call, teardown) to ensure the span status is correctly set to ERROR and the trace URL is printed for any failure.

Suggested change
rep = getattr(request.node, "rep_call", None)
if rep is not None and rep.failed:
span.set_status(trace.Status(trace.StatusCode.ERROR))
print(f"\nTrace: {trace_url}")
failed = False
for phase in ("setup", "call", "teardown"):
rep = getattr(request.node, f"rep_{phase}", None)
if rep is not None and rep.failed:
failed = True
break
if failed:
span.set_status(trace.Status(trace.StatusCode.ERROR))
print(f"\nTrace: {trace_url}")

3 changes: 3 additions & 0 deletions xtest/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ dependencies = [
"jsonschema>=4.25.1",
"jsonschema-specifications>=2025.9.1",
"MarkupSafe>=3.0.3",
"opentelemetry-api>=1.29.0",
"opentelemetry-sdk>=1.29.0",
"opentelemetry-exporter-otlp-proto-grpc>=1.29.0",
"packaging>=26.2",
"pluggy>=1.6.0",
"pycparser>=3.0",
Expand Down
Loading
Loading