From 1e76affa1d7f7d32d08fb606044516e061cee831 Mon Sep 17 00:00:00 2001 From: Evgeny Formanenko Date: Fri, 3 Jul 2026 18:56:43 +0300 Subject: [PATCH] feat(tools): read-only access for centaur agents to Grafana, k8s, GCP logs, Linear MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire five infra systems into the sandbox tool surface. Two ride existing tools (grafana, linear); the other three are covered by two new read-only tools (k8s, gcp-logs) shipped in this fork and layered over upstream via overlays.sources. - overlay: add subsquid/centaur as a tool overlay source over paradigmxyz/centaur - grafana: allow the self-hosted host grafana.infra.gc.subsquid.io; inject GRAFANA_URL via sandbox.extraEnv (merged into SESSION_SANDBOX_EXTRA_ENV without dropping the overlay skill dirs) - k8s: new read-only tool for GKE main + sqd-compute-cluster (inject-mode Bearer per cluster, view-scoped RBAC, CA bundles shipped in-tool); RBAC manifest under contrib/k8s-readonly - gcp-logs: new read-only tool over Cloud Logging using iron-proxy's native gcp_auth transform (SA key -> access token, logging.viewer scope) - docs: contrib/ACCESS_SETUP.md — 1Password items, tokens, RBAC, GCP SA, rollout Secrets resolve from 1Password as op://<vault>/<item>/credential. No api-rs or console changes. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- contrib/ACCESS_SETUP.md | 134 +++++++++++++++++ contrib/chart/values.mo4islona.yaml | 32 ++++ contrib/k8s-readonly/readonly-sa.yaml | 51 +++++++ tools/infra/gcp-logs/.env.example | 6 + tools/infra/gcp-logs/cli.py | 68 +++++++++ tools/infra/gcp-logs/client.py | 138 +++++++++++++++++ tools/infra/gcp-logs/pyproject.toml | 38 +++++ tools/infra/grafana/pyproject.toml | 5 +- tools/infra/k8s/.env.example | 13 ++ tools/infra/k8s/ca/gke.pem | 25 ++++ tools/infra/k8s/ca/sqd.pem | 19 +++ tools/infra/k8s/cli.py | 150 +++++++++++++++++++ tools/infra/k8s/client.py | 208 ++++++++++++++++++++++++++ tools/infra/k8s/pyproject.toml | 45 ++++++ 14 files changed, 931 insertions(+), 1 deletion(-) create mode 100644 contrib/ACCESS_SETUP.md create mode 100644 contrib/k8s-readonly/readonly-sa.yaml create mode 100644 tools/infra/gcp-logs/.env.example create mode 100644 tools/infra/gcp-logs/cli.py create mode 100644 tools/infra/gcp-logs/client.py create mode 100644 tools/infra/gcp-logs/pyproject.toml create mode 100644 tools/infra/k8s/.env.example create mode 100644 tools/infra/k8s/ca/gke.pem create mode 100644 tools/infra/k8s/ca/sqd.pem create mode 100644 tools/infra/k8s/cli.py create mode 100644 tools/infra/k8s/client.py create mode 100644 tools/infra/k8s/pyproject.toml diff --git a/contrib/ACCESS_SETUP.md b/contrib/ACCESS_SETUP.md new file mode 100644 index 000000000..9cd3b64eb --- /dev/null +++ b/contrib/ACCESS_SETUP.md @@ -0,0 +1,134 @@ +# Centaur access setup — Grafana, GKE, sqd-compute-cluster, GCP logs, Linear + +Wiring so **centaur agents** (not you locally) can reach these five systems. +Two of them ride existing tools (`grafana`, `linear`); the other three are covered by the two new +read-only tools shipped in this fork (`k8s`, `gcp-logs`). Secrets are resolved by iron-proxy +from 1Password as `op://<vault>/<item>/credential`. + +- **1Password vault:** `nlk6gbcqu6ddb43laiodtuntta` +- **Every secret item:** title = the secret name, one field literally named + `credential`. 
+ +The code side (this fork) is already done: +- `overlays.sources` adds `subsquid/centaur` on top of upstream (`values.mo4islona.yaml`). +- `sandbox.extraEnv` injects `GRAFANA_URL`, the `KUBE_*` cluster config, and + `GCP_LOGGING_PROJECT` (`values.mo4islona.yaml`); the chart merges these into the + sandbox env without dropping the overlay skill dirs. +- `tools/infra/grafana` host allowlist now includes `grafana.infra.gc.subsquid.io`. +- `tools/infra/k8s` and `tools/infra/gcp-logs` are the new read-only tools. +- `contrib/k8s-readonly/readonly-sa.yaml` is the per-cluster RBAC. + +What remains is the parts only you can do: create the tokens, drop them into +1Password, apply RBAC, and roll the deploy. + +--- + +## 0. Prereq — repo-cache must read the private fork + +The overlay source `subsquid/centaur` is private, so repo-cache needs a GitHub +token with read access. Either seed `GITHUB_TOKEN` via the bootstrap script or +set `repoCache.githubToken.existingSecretName`. Without it the overlay silently +falls back to upstream-only and the new tools never appear. + +Also **push this branch** (`feat/per-user-tenancy`) to `subsquid/centaur` — the +`overlays.sources` ref points at it. (Bump the ref once it merges to `main`.) + +--- + +## 1. Linear (existing tool — secret only) + +1. Get a Linear API key (Linear → Settings → API → Personal API keys), scoped + read/write as you want the agent to act. +2. Store it: + ```bash + op item create --vault nlk6gbcqu6ddb43laiodtuntta --category "API Credential" \ + --title LINEAR_API_KEY credential="lin_api_xxx" + ``` + +## 2. Grafana (existing tool — self-hosted, host override already in the fork) + +1. In `https://grafana.infra.gc.subsquid.io` → Administration → Service accounts, + create a **Viewer** service account and a token. +2. 
Store it: + ```bash + op item create --vault nlk6gbcqu6ddb43laiodtuntta --category "API Credential" \ + --title GRAFANA_API_KEY credential="glsa_xxx" + ``` + `GRAFANA_URL` is already injected by the chart (not a secret). + +## 3. GKE + sqd-compute-cluster (new `k8s` tool) + +Read-only SA bound to the built-in `view` ClusterRole (excludes Secrets). + +```bash +# Apply RBAC in BOTH clusters +kubectl --context gke_bright-meridian-316511_europe-west3_main \ + -f contrib/k8s-readonly/readonly-sa.yaml apply +kubectl --context sqd-compute-cluster \ + -f contrib/k8s-readonly/readonly-sa.yaml apply + +# Extract each token and store it +GKE_TOKEN=$(kubectl --context gke_bright-meridian-316511_europe-west3_main \ + -n kube-system get secret centaur-readonly-token -o jsonpath='{.data.token}' | base64 -d) +op item create --vault nlk6gbcqu6ddb43laiodtuntta --category "API Credential" \ + --title K8S_GKE_TOKEN credential="$GKE_TOKEN" + +SQD_TOKEN=$(kubectl --context sqd-compute-cluster \ + -n kube-system get secret centaur-readonly-token -o jsonpath='{.data.token}' | base64 -d) +op item create --vault nlk6gbcqu6ddb43laiodtuntta --category "API Credential" \ + --title K8S_SQD_TOKEN credential="$SQD_TOKEN" +``` + +> **Reachability to verify after deploy:** the GKE API server may have *master +> authorized networks* that reject the sandbox egress IP, and the sqd endpoint is +> exposed on `:6443` — confirm the iron-proxy host rule matches host **and** port +> (adjust the `162.19.107.87:6443` entry in `tools/infra/k8s/pyproject.toml` +> `[tool.centaur].hosts` if a CONNECT to sqd is denied). + +## 4. GCP logs (new `gcp-logs` tool) + +iron-proxy's `gcp_auth` transform mints the access token from a SA key — the tool +carries no creds. 
+ +```bash +PROJECT=bright-meridian-316511 +gcloud iam service-accounts create centaur-logs-reader \ + --project "$PROJECT" --display-name "Centaur read-only logs" +gcloud projects add-iam-policy-binding "$PROJECT" \ + --member "serviceAccount:centaur-logs-reader@${PROJECT}.iam.gserviceaccount.com" \ + --role roles/logging.viewer +gcloud iam service-accounts keys create /tmp/centaur-logs-sa.json \ + --iam-account "centaur-logs-reader@${PROJECT}.iam.gserviceaccount.com" + +# Store the WHOLE JSON key as the credential +op item create --vault nlk6gbcqu6ddb43laiodtuntta --category "API Credential" \ + --title GCP_LOGGING_SA credential="$(cat /tmp/centaur-logs-sa.json)" +rm -f /tmp/centaur-logs-sa.json +``` + +--- + +## 5. Roll the deploy + +```bash +helm upgrade --install centaur contrib/chart -n centaur --create-namespace \ + -f contrib/chart/values.dev.yaml \ + -f contrib/chart/values.mo4islona.yaml \ + -f contrib/chart/values.mo4islona-pool.yaml +``` + +## 6. Verify from an agent + +``` +call tools # k8s, gcp-logs, grafana, linear all listed +call discover k8s +call k8s clusters +call k8s pods gke centaur +call gcp-logs read 'resource.type="k8s_container"' --freshness 30m +call grafana search +call linear ... +``` + +If a tool is missing: check repo-cache synced the overlay (Prereq 0). If a call +gets a 401/405 through the proxy: the host isn't in the tool's allowlist or the +1Password item name/field is wrong. diff --git a/contrib/chart/values.mo4islona.yaml b/contrib/chart/values.mo4islona.yaml index be2edc15e..1caa579be 100644 --- a/contrib/chart/values.mo4islona.yaml +++ b/contrib/chart/values.mo4islona.yaml @@ -27,6 +27,22 @@ sandbox: image: repository: docker.io/mo4islona/centaur-agent pullPolicy: Always + # Non-secret tool config injected into every sandbox. The chart merges these + # into SESSION_SANDBOX_EXTRA_ENV alongside the overlay skill dirs (it does NOT + # clobber them — set them here, not in apiRs.extraEnv). 
Bearer tokens are NOT + # here: those come from iron-proxy / 1Password. These are plain values the tool + # clients read via os.getenv. + extraEnv: + # grafana tool (self-hosted base URL) + GRAFANA_URL: https://grafana.infra.gc.subsquid.io + # k8s tool: per-cluster API endpoints + labels (CA bundles ship in the tool) + KUBE_CLUSTERS: gke,sqd + KUBE_GKE_LABEL: GKE main (bright-meridian-316511 / europe-west3) + KUBE_GKE_SERVER: https://35.246.168.135 + KUBE_SQD_LABEL: sqd-compute-cluster (self-managed) + KUBE_SQD_SERVER: https://162.19.107.87:6443 + # gcp-logs tool: default project + GCP_LOGGING_PROJECT: bright-meridian-316511 ironProxy: image: @@ -37,3 +53,19 @@ console: image: repository: docker.io/mo4islona/centaur-console pullPolicy: Always + +# Tool delivery. Base tools come from upstream paradigmxyz/centaur; this fork is +# layered on top as an overlay so its infra tools (k8s, gcp-logs) and its +# self-hosted grafana host override SHADOW the upstream copies (later source +# wins on name collision). repo-cache syncs both. +# +# PREREQ: subsquid/centaur is private, so repo-cache needs a GitHub token with +# read access — set repoCache.githubToken.existingSecretName (or seed GITHUB_TOKEN +# via bootstrap-k8s-secrets.sh). `ref` must point at a branch/tag that carries +# tools/infra/{k8s,gcp-logs} + the grafana host override (this branch until merged). +overlays: + sources: + - repo: paradigmxyz/centaur + ref: "" + - repo: subsquid/centaur + ref: feat/per-user-tenancy diff --git a/contrib/k8s-readonly/readonly-sa.yaml b/contrib/k8s-readonly/readonly-sa.yaml new file mode 100644 index 000000000..bd1206041 --- /dev/null +++ b/contrib/k8s-readonly/readonly-sa.yaml @@ -0,0 +1,51 @@ +# Read-only ServiceAccount for the centaur `k8s` tool. +# +# Apply once per cluster (GKE main + sqd-compute-cluster). 
Binds a dedicated SA +# to the built-in `view` ClusterRole (read-only; notably excludes Secrets), then +# mints a long-lived token the iron-proxy injects as the cluster's bearer. +# +# kubectl --context gke_bright-meridian-316511_europe-west3_main -f contrib/k8s-readonly/readonly-sa.yaml apply +# kubectl --context sqd-compute-cluster -f contrib/k8s-readonly/readonly-sa.yaml apply +# +# Then extract the token (goes into 1Password as K8S_GKE_TOKEN / K8S_SQD_TOKEN): +# kubectl --context <context> -n kube-system get secret centaur-readonly-token \ +# -o jsonpath='{.data.token}' | base64 -d +apiVersion: v1 +kind: ServiceAccount +metadata: + name: centaur-readonly + namespace: kube-system + labels: + app.kubernetes.io/managed-by: centaur + app.kubernetes.io/part-of: centaur-tools +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: centaur-readonly-view + labels: + app.kubernetes.io/managed-by: centaur + app.kubernetes.io/part-of: centaur-tools +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: view +subjects: + - kind: ServiceAccount + name: centaur-readonly + namespace: kube-system +--- +# Long-lived (non-expiring) token for the SA. Modern clusters issue short-lived +# tokens via TokenRequest; this legacy Secret form gives a static bearer suitable +# for storing in 1Password. Rotate by deleting + re-creating this Secret. +apiVersion: v1 +kind: Secret +metadata: + name: centaur-readonly-token + namespace: kube-system + annotations: + kubernetes.io/service-account.name: centaur-readonly + labels: + app.kubernetes.io/managed-by: centaur + app.kubernetes.io/part-of: centaur-tools +type: kubernetes.io/service-account-token diff --git a/tools/infra/gcp-logs/.env.example b/tools/infra/gcp-logs/.env.example new file mode 100644 index 000000000..2de459c32 --- /dev/null +++ b/tools/infra/gcp-logs/.env.example @@ -0,0 +1,6 @@ +# Local dev for the gcp-logs tool. 
In production the access token is minted by +# iron-proxy's gcp_auth transform from the GCP_LOGGING_SA key in 1Password. +GCP_LOGGING_PROJECT=bright-meridian-316511 +# Local-only: path to a service-account key JSON to auth directly (prod uses the +# proxy). Not read by the tool itself — use with `gcloud auth` locally. +# GOOGLE_APPLICATION_CREDENTIALS=/path/to/sa.json diff --git a/tools/infra/gcp-logs/cli.py b/tools/infra/gcp-logs/cli.py new file mode 100644 index 000000000..c83fc7f29 --- /dev/null +++ b/tools/infra/gcp-logs/cli.py @@ -0,0 +1,68 @@ +"""CLI for read-only Google Cloud Logging.""" + +import json + +import typer +from dotenv import load_dotenv +from rich.console import Console + +from centaur_sdk import Table + +load_dotenv() + +app = typer.Typer(name="gcp-logs", help="Read-only Google Cloud Logging queries") +console = Console() + + +def get_client(project: str | None = None): + from .client import GcpLogsClient + + return GcpLogsClient(project) + + +@app.command("read") +def read( + filter_expr: str = typer.Argument(None, help="Cloud Logging filter expression"), + freshness: str = typer.Option("1h", "--freshness", "-f", help="Time window, e.g. 30m, 1h, 2d"), + limit: int = typer.Option(50, "--limit", "-n", help="Max entries"), + order: str = typer.Option("desc", "--order", help="Sort by timestamp: asc | desc"), + project: str = typer.Option(None, "--project", "-p", help="GCP project id"), + json_output: bool = typer.Option(False, "--json", help="Output as JSON"), +): + """Query log entries. 
Example filter: 'resource.type="k8s_container" AND severity>=ERROR'.""" + rows = get_client(project).read(filter_expr, freshness, limit, order) + if json_output: + print(json.dumps(rows, indent=2, default=str)) + return + if not rows: + console.print("[yellow]No log entries[/]") + return + table = Table(title="Log entries") + for col in ("Timestamp", "Severity", "Resource", "Log", "Payload"): + table.add_column(col) + for r in rows: + payload = r["payload"] + if not isinstance(payload, str): + payload = json.dumps(payload, default=str) + table.add_row( + str(r["timestamp"]), + str(r["severity"]), + str(r["resource"]), + str(r["log"]), + payload[:200], + ) + console.print(table) + + +@app.command("logs") +def logs( + limit: int = typer.Option(200, "--limit", "-n", help="Max log names"), + project: str = typer.Option(None, "--project", "-p", help="GCP project id"), +): + """List log names present in the project.""" + for name in get_client(project).logs(limit): + print(name) + + +if __name__ == "__main__": + app() diff --git a/tools/infra/gcp-logs/client.py b/tools/infra/gcp-logs/client.py new file mode 100644 index 000000000..4171fb597 --- /dev/null +++ b/tools/infra/gcp-logs/client.py @@ -0,0 +1,138 @@ +"""Read-only Google Cloud Logging client. + +The tool sends no credentials of its own. iron-proxy's ``gcp_auth`` transform +mints a short-lived access token from the ``GCP_LOGGING_SA`` service-account key +(scope ``logging.read``) and injects it as the ``Authorization`` header on +requests to ``logging.googleapis.com`` (see pyproject ``[tool.centaur]``). The SA +carries only ``roles/logging.viewer``, so every call is read-only. + +Project defaults to ``bright-meridian-316511`` and can be overridden with the +``GCP_LOGGING_PROJECT`` env var (plain, non-secret). 
+""" + +from __future__ import annotations + +import os +from typing import Any + +import httpx + +_API = "https://logging.googleapis.com/v2" +_DEFAULT_PROJECT = "bright-meridian-316511" + + +def default_project() -> str: + return os.getenv("GCP_LOGGING_PROJECT", _DEFAULT_PROJECT) # noqa: TID251 + + +class GcpLogsError(RuntimeError): + """A Cloud Logging API call failed.""" + + +class GcpLogsClient: + """Client for the Cloud Logging v2 read API.""" + + def __init__(self, project: str | None = None, timeout: float = 60.0): + self.project = project or default_project() + self.timeout = timeout + self._client: httpx.Client | None = None + + @property + def client(self) -> httpx.Client: + if self._client is None: + # No Authorization header here: iron-proxy's gcp_auth transform injects + # the bearer for logging.googleapis.com. Locally (no proxy) calls 401. + self._client = httpx.Client( + base_url=_API, + headers={"Content-Type": "application/json"}, + timeout=self.timeout, + ) + return self._client + + def _post(self, path: str, body: dict[str, Any]) -> dict[str, Any]: + resp = self.client.post(path, json=body) + if resp.status_code >= 400: + raise GcpLogsError(f"POST {path} -> {resp.status_code}: {resp.text[:800]}") + return resp.json() + + def read( + self, + filter_expr: str | None = None, + freshness: str | None = "1h", + limit: int = 50, + order: str = "desc", + ) -> list[dict[str, Any]]: + """Query log entries. + + ``filter_expr`` is a Cloud Logging filter + (https://cloud.google.com/logging/docs/view/logging-query-language). + ``freshness`` (e.g. "1h", "30m", "2d") is appended as a timestamp bound + unless the filter already constrains timestamp. Returns newest first by + default. 
+ """ + filters = [f for f in [filter_expr] if f] + if freshness and "timestamp" not in (filter_expr or ""): + filters.append(f'timestamp >= "{self._freshness_to_rfc3339(freshness)}"') + body: dict[str, Any] = { + "resourceNames": [f"projects/{self.project}"], + "pageSize": min(max(limit, 1), 1000), + "orderBy": f"timestamp {order}", + } + if filters: + body["filter"] = " AND ".join(filters) + entries = self._post("/entries:list", body).get("entries", []) + return [self._summarize(e) for e in entries] + + def logs(self, limit: int = 200) -> list[str]: + """List the log names present in the project.""" + resp = self.client.get( + f"/projects/{self.project}/logs", + params={"pageSize": min(max(limit, 1), 1000)}, + ) + if resp.status_code >= 400: + raise GcpLogsError(f"list logs -> {resp.status_code}: {resp.text[:800]}") + return resp.json().get("logNames", []) + + @staticmethod + def _summarize(entry: dict[str, Any]) -> dict[str, Any]: + payload = ( + entry.get("textPayload") + or entry.get("jsonPayload") + or entry.get("protoPayload") + or "" + ) + return { + "timestamp": entry.get("timestamp"), + "severity": entry.get("severity"), + "resource": entry.get("resource", {}).get("type"), + "log": (entry.get("logName", "").split("/logs/", 1) or [""])[-1], + "payload": payload, + } + + @staticmethod + def _freshness_to_rfc3339(freshness: str) -> str: + """Convert a relative window (e.g. "1h") to an absolute RFC3339 bound. + + Cloud Logging filters only accept absolute timestamps, so we resolve the + window against the current UTC time (trusted in-sandbox). + """ + return _relative_bound(freshness) + + +def _relative_bound(freshness: str) -> str: + """Best-effort relative timestamp bound. + + We import time lazily and compute an absolute RFC3339 string; this runs + in-sandbox where the system clock is trusted. 
+ """ + import datetime as _dt + + units = {"s": 1, "m": 60, "h": 3600, "d": 86400} + try: + value, unit = int(freshness[:-1]), freshness[-1] + seconds = value * units[unit] + except (ValueError, KeyError, IndexError) as exc: + raise GcpLogsError(f"bad freshness {freshness!r}; use e.g. 30m, 1h, 2d") from exc + now = _dt.datetime.now(_dt.timezone.utc) + bound = now - _dt.timedelta(seconds=seconds) + return bound.strftime("%Y-%m-%dT%H:%M:%SZ") diff --git a/tools/infra/gcp-logs/pyproject.toml b/tools/infra/gcp-logs/pyproject.toml new file mode 100644 index 000000000..6c0a6dbc6 --- /dev/null +++ b/tools/infra/gcp-logs/pyproject.toml @@ -0,0 +1,38 @@ +[project] +name = "gcp-logs" +description = "Read-only Google Cloud Logging — query log entries and list logs in the bright-meridian-316511 project" +version = "0.1.0" +requires-python = ">=3.11" +dependencies = [ + "httpx>=0.27.0", + "typer>=0.12.0", + "rich>=13.0.0", + "python-dotenv>=1.0.0", +] + +[project.scripts] +gcp-logs = "centaur_tool_gcp_logs.cli:app" + +[tool.hatch.build.targets.wheel] +packages = ["."] + +[tool.hatch.build.targets.wheel.sources] +"." = "centaur_tool_gcp_logs" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + + +[tool.centaur] +module = "client.py" +# Egress allowlist: the Cloud Logging API host. +hosts = ["logging.googleapis.com"] +# iron-proxy's `gcp_auth` transform reads the service-account JSON key from +# op:///GCP_LOGGING_SA/credential, mints a short-lived OAuth access token +# for the declared scope, and injects it as `Authorization: Bearer` on requests +# to logging.googleapis.com. The SA needs only roles/logging.viewer, so the grant +# is read-only. The tool ships no credentials of its own. 
+secrets = [ + { type = "gcp_auth", name = "GCP_LOGGING_SA", hosts = ["logging.googleapis.com"], scopes = ["https://www.googleapis.com/auth/logging.read"] }, +] diff --git a/tools/infra/grafana/pyproject.toml b/tools/infra/grafana/pyproject.toml index ee2717d2a..7b9cc7c44 100644 --- a/tools/infra/grafana/pyproject.toml +++ b/tools/infra/grafana/pyproject.toml @@ -32,7 +32,10 @@ module = "client.py" # iron-proxy `secrets` transform with no host rules, so the token is never # injected AND egress to Grafana is denied (every call 405s/401s through the # CONNECT-only proxy). -hosts = ["*.grafana.net"] +# subsquid overlay: this deployment's Grafana is self-hosted, so the +# GRAFANA_API_KEY Bearer is scoped to (and egress allowed for) +# grafana.infra.gc.subsquid.io. The *.grafana.net entry stays for Grafana Cloud. +hosts = ["*.grafana.net", "grafana.infra.gc.subsquid.io"] secrets = [ # NOTE: GRAFANA_URL is NOT a secret — it's the base URL the client reads # in-process (os.getenv) to pick its connection target, so it can never be diff --git a/tools/infra/k8s/.env.example b/tools/infra/k8s/.env.example new file mode 100644 index 000000000..4e570b8c5 --- /dev/null +++ b/tools/infra/k8s/.env.example @@ -0,0 +1,13 @@ +# Local dev for the k8s tool. In production these are injected by the chart +# (SESSION_SANDBOX_EXTRA_ENV) and the bearer tokens come from iron-proxy / 1Password. +KUBE_CLUSTERS=gke,sqd +KUBE_GKE_SERVER=https://35.246.168.135 +KUBE_GKE_LABEL=GKE main +KUBE_SQD_SERVER=https://162.19.107.87:6443 +KUBE_SQD_LABEL=sqd-compute-cluster +# Local-only: real SA bearer tokens (in prod iron-proxy injects these). +K8S_GKE_TOKEN= +K8S_SQD_TOKEN= +# Optional CA override (base64 PEM); defaults to the packaged ca/<cluster>.pem. 
+KUBE_GKE_CA_B64= +KUBE_SQD_CA_B64= diff --git a/tools/infra/k8s/ca/gke.pem b/tools/infra/k8s/ca/gke.pem new file mode 100644 index 000000000..7b4078025 --- /dev/null +++ b/tools/infra/k8s/ca/gke.pem @@ -0,0 +1,25 @@ +-----BEGIN CERTIFICATE----- +MIIELTCCApWgAwIBAgIRAN7OQG4HH4Iqr9h65o+AdPgwDQYJKoZIhvcNAQELBQAw +LzEtMCsGA1UEAxMkMzAwZmQ4OTItM2NiYS00YjMzLTk1M2MtMzc5NjhjOGI4MTI0 +MCAXDTI2MDUyNzEzMzkyOFoYDzIwNTYwNTE5MTQzOTI4WjAvMS0wKwYDVQQDEyQz +MDBmZDg5Mi0zY2JhLTRiMzMtOTUzYy0zNzk2OGM4YjgxMjQwggGiMA0GCSqGSIb3 +DQEBAQUAA4IBjwAwggGKAoIBgQDLru2A0AdOJXxGi0l+xZahbblFIQuzE5b+13oz +QkHPAi+c6a2LFZL4PCmAKyzNil7iUSLfeX5XxJIpm1E0/FBJ8i8dEAbyg4vlDOCv +cBg++O+ONhIbeHl+tct6AtDmO+saaIHSvdANM970gO1qfqFnKO1MNumjtA0qFZyD +rhcfKtdvXAlO8wHKgxIKwuyD1I2pYSx3YVKLdjq7Pd8caQFVMgWKw8S1f/8i0pJ2 +BWV5mUdlTC4Bu+65txV5iprCXiz9aR3d7AnreHohzZI87EJaVIRQbGJg9a+sFuUK +HR55A144Sm3RJiNwoyr2arO/HCgvNuxe0wT+1D1Kf/QN2kLiEKX9rS8XTdqHa/XN +GdtCf8mUg07XIh7+ZLmN9xnDl3Gvjn/aU8ocA1rV/uYAVmU2C2xFMboZPNT6YKfC +f+aQKgXmZY+GxMmhSDdTNRboUeJ4SuCtcN4r4oPHvjiJWFKOH2Un8WWJsFGB8YFC +witr4JCCLxJm16LwzHSe7y65JMUCAwEAAaNCMEAwDgYDVR0PAQH/BAQDAgIEMA8G +A1UdEwEB/wQFMAMBAf8wHQYDVR0OBBYEFD2KNN6hsiR7MWT9JXRHpzs1PBeGMA0G +CSqGSIb3DQEBCwUAA4IBgQBqCm3YSfTYwlHChwR/Pro2BscEu2suAGtV4p2UvVjf +i7gyeqFQx1JYQ9f7GmbdELw33eIudI71bOwqqwzrxpfY7j/StqOpbmCsRaSnoy5h +XE2LxTDI5bmNk5IHgSZwpHS1q2JNGgC0M06qnVVJ0BF4NoQRtpyu0UPH29+3S9d0 +MXiRLR4b8G2fAoLfXNitYs5VzpUFUBw0/LpzeIZ+RsDd6o/x+uFUYmRFMbImoz9f +ciZlyplIGHLGXA8dmKq/WVy0U6pr+V5cM4jnuRILjM51mMr+D2PHmUrZs422RH1x +56mJJrR23y4jqlyA4SsLQsTkzIZMtnoFz3fbPSawZ+Vbkca9sFm7u7C8MntfSETl +tPogs+z8aIUPsODNqwQcLoIJsKmOyJQvei6WwuyTEVxK3lQMsTuQRIrgMwZn1nOn +Xe7IaC1vNahlUnqYT4GQSOd0o6uGgTvh9owrHbvWgm1dxdZudRPgVS5H8iwgD+8W +qyq0yFIZdgW5puV8Pf2ts70= +-----END CERTIFICATE----- diff --git a/tools/infra/k8s/ca/sqd.pem b/tools/infra/k8s/ca/sqd.pem new file mode 100644 index 000000000..8eae0fed8 --- /dev/null +++ b/tools/infra/k8s/ca/sqd.pem @@ -0,0 +1,19 @@ +-----BEGIN CERTIFICATE----- 
+MIIDBTCCAe2gAwIBAgIIKJtKcYwc3KkwDQYJKoZIhvcNAQELBQAwFTETMBEGA1UE +AxMKa3ViZXJuZXRlczAeFw0yNDA3MjMwNDM1MDFaFw0zNDA3MjEwNDQwMDFaMBUx +EzARBgNVBAMTCmt1YmVybmV0ZXMwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEK +AoIBAQCrSbZOVek0z1lhBjIiayU2kbg6W3udaaX7fvV0qkgeRSuased1o0NLWhx8 +CjtskVkRtoNfbraGftEXM/aR/RAfUmmX+tq5pkKt/x6Dq4cUS4QMLIh8ZpqDw9x0 +2qEiLx2rEjHxfJWIPUyQzrzCCSrz4bCWh1xXQSekahvkIl3PnlPSgDpZuhuway3T +dGoOfihgocQj8PCYakH5x6yTQhjw1H9eWLld+tA0Jh7C9/34AczoJsjTCeesoB1n +wDR5r6CVZcxb03gXW8XGHAnj6KaEXYDTYRONgKFJlWvP3jnWBRK3TQ5t83Zsybg/ +LQuRwNRVRzhl51Jj+YU7vPFGky+ZAgMBAAGjWTBXMA4GA1UdDwEB/wQEAwICpDAP +BgNVHRMBAf8EBTADAQH/MB0GA1UdDgQWBBQOAKAzFZyexJw10dscCg18R/NqDjAV +BgNVHREEDjAMggprdWJlcm5ldGVzMA0GCSqGSIb3DQEBCwUAA4IBAQBP9Ba8HZsI +U0yExjJQsEiEeNuh2StlHtFpVp7Om+byemvgVh9Llf/5gIyB1FTMsuz3dQLsDt5k +DqLWxdCx3So+DLgx9p7WTh84BBEgHB0G5wrbpnGsd4s18Odu++SFujhuhilm1oLE +fvlIW1aJcbEXCYhsRkcPL9vBRRUlD6iqG1ntmYQ/vxh8d9JYKofsrnqVTBht5i5i +tI/LRsZdFR7RSTUSTNZcTjWW5k6TGjVWbdQ4m7o/xoK4XDoUSBYjhuVPHGcWBzP7 +ll9/wMgjJQ6dHWBVTNgm/2a7+1qPTlgp9//31X/fly6qrjmEynvy2q+5iF/GC53k +ikIrPeAFbuZE +-----END CERTIFICATE----- diff --git a/tools/infra/k8s/cli.py b/tools/infra/k8s/cli.py new file mode 100644 index 000000000..96d5550cf --- /dev/null +++ b/tools/infra/k8s/cli.py @@ -0,0 +1,150 @@ +"""CLI for read-only Kubernetes access (GKE main + sqd-compute-cluster).""" + +import json + +import typer +from dotenv import load_dotenv +from rich.console import Console + +from centaur_sdk import Table + +load_dotenv() + +app = typer.Typer(name="k8s", help="Read-only Kubernetes access to the GKE main and sqd-compute clusters") +console = Console() + + +def get_client(cluster: str): + from .client import KubeClient + + return KubeClient(cluster) + + +@app.command("clusters") +def clusters(json_output: bool = typer.Option(False, "--json", help="Output as JSON")): + """List the clusters this deployment can reach.""" + from .client import cluster_label, configured_clusters + + rows = [{"key": c, "label": cluster_label(c)} 
for c in configured_clusters()] + if json_output: + print(json.dumps(rows, indent=2)) + return + table = Table(title="Clusters") + table.add_column("Key", style="cyan") + table.add_column("Label", style="white") + for r in rows: + table.add_row(r["key"], r["label"]) + console.print(table) + + +@app.command("namespaces") +def namespaces( + cluster: str = typer.Argument(..., help="Cluster key (e.g. gke, sqd)"), + json_output: bool = typer.Option(False, "--json", help="Output as JSON"), +): + """List namespaces.""" + rows = get_client(cluster).namespaces() + if json_output: + print(json.dumps(rows, indent=2)) + return + table = Table(title=f"Namespaces ({cluster})") + table.add_column("Name", style="cyan") + table.add_column("Status", style="green") + for r in rows: + table.add_row(str(r["name"]), str(r["status"])) + console.print(table) + + +@app.command("pods") +def pods( + cluster: str = typer.Argument(..., help="Cluster key (e.g. gke, sqd)"), + namespace: str = typer.Argument(..., help="Namespace"), + selector: str = typer.Option(None, "--selector", "-l", help="Label selector"), + json_output: bool = typer.Option(False, "--json", help="Output as JSON"), +): + """List pods in a namespace.""" + rows = get_client(cluster).pods(namespace, selector) + if json_output: + print(json.dumps(rows, indent=2)) + return + table = Table(title=f"Pods ({cluster}/{namespace})") + for col in ("Name", "Phase", "Ready", "Restarts", "Node"): + table.add_column(col) + for r in rows: + table.add_row(str(r["name"]), str(r["phase"]), str(r["ready"]), str(r["restarts"]), str(r["node"])) + console.print(table) + + +@app.command("logs") +def logs( + cluster: str = typer.Argument(..., help="Cluster key (e.g. 
gke, sqd)"), + namespace: str = typer.Argument(..., help="Namespace"), + pod: str = typer.Argument(..., help="Pod name"), + container: str = typer.Option(None, "--container", "-c", help="Container name"), + tail: int = typer.Option(200, "--tail", "-n", help="Tail N lines"), + previous: bool = typer.Option(False, "--previous", "-p", help="Previous (crashed) instance"), +): + """Fetch pod logs.""" + print(get_client(cluster).pod_logs(namespace, pod, container, tail, previous)) + + +@app.command("deployments") +def deployments( + cluster: str = typer.Argument(..., help="Cluster key (e.g. gke, sqd)"), + namespace: str = typer.Argument(..., help="Namespace"), + json_output: bool = typer.Option(False, "--json", help="Output as JSON"), +): + """List deployments in a namespace.""" + rows = get_client(cluster).deployments(namespace) + if json_output: + print(json.dumps(rows, indent=2)) + return + table = Table(title=f"Deployments ({cluster}/{namespace})") + for col in ("Name", "Ready", "Updated", "Available"): + table.add_column(col) + for r in rows: + table.add_row(str(r["name"]), str(r["ready"]), str(r["updated"]), str(r["available"])) + console.print(table) + + +@app.command("events") +def events( + cluster: str = typer.Argument(..., help="Cluster key (e.g. 
gke, sqd)"), + namespace: str = typer.Argument(..., help="Namespace"), + limit: int = typer.Option(50, "--limit", "-n", help="Max events"), + json_output: bool = typer.Option(False, "--json", help="Output as JSON"), +): + """List recent events in a namespace.""" + rows = get_client(cluster).events(namespace, limit) + if json_output: + print(json.dumps(rows, indent=2)) + return + table = Table(title=f"Events ({cluster}/{namespace})") + for col in ("Type", "Reason", "Object", "Message", "Last"): + table.add_column(col) + for r in rows: + table.add_row(str(r["type"]), str(r["reason"]), str(r["object"]), str(r["message"]), str(r["last"])) + console.print(table) + + +@app.command("nodes") +def nodes( + cluster: str = typer.Argument(..., help="Cluster key (e.g. gke, sqd)"), + json_output: bool = typer.Option(False, "--json", help="Output as JSON"), +): + """List nodes.""" + rows = get_client(cluster).nodes() + print(json.dumps(rows, indent=2)) + + +@app.command("get") +def raw_get( + cluster: str = typer.Argument(..., help="Cluster key (e.g. gke, sqd)"), + path: str = typer.Argument(..., help="API path, e.g. /api/v1/namespaces/centaur/pods"), +): + """Arbitrary read-only GET against the kube API (RBAC `view` scoped).""" + print(json.dumps(get_client(cluster).raw_get(path), indent=2)) + + +if __name__ == "__main__": + app() diff --git a/tools/infra/k8s/client.py b/tools/infra/k8s/client.py new file mode 100644 index 000000000..c4ad8faa4 --- /dev/null +++ b/tools/infra/k8s/client.py @@ -0,0 +1,208 @@ +"""Read-only Kubernetes API client for the GKE main and sqd-compute clusters. + +Authentication is handled by iron-proxy: each cluster's `K8S__TOKEN` +secret is injected as a `Authorization: Bearer` header on requests to that +cluster's API host (see pyproject `[tool.centaur].secrets`). The client sends a +placeholder the proxy overwrites, so the real ServiceAccount token never lands +in the sandbox. 
RBAC binds those tokens to the built-in `view` ClusterRole, so +every call is read-only by construction — this client also refuses non-GET verbs +as a second line of defence. + +Per-cluster wiring comes from plain (non-secret) env, injected via the chart's +`SESSION_SANDBOX_EXTRA_ENV`: + + KUBE_CLUSTERS comma list of cluster keys, e.g. "gke,sqd" + KUBE__SERVER https://host[:port] of the API server + KUBE__LABEL human label (optional) + KUBE__CA_B64 base64 CA bundle (optional; overrides the packaged one) + +The CA bundle for each cluster ships with the tool under ``ca/.pem``. +""" + +from __future__ import annotations + +import base64 +import os +import tempfile +from pathlib import Path +from typing import Any + +import httpx + +from centaur_sdk import secret + +_CA_DIR = Path(__file__).resolve().parent / "ca" +_DEFAULT_CLUSTERS = "gke,sqd" + + +def configured_clusters() -> list[str]: + """Cluster keys the deployment wired up, lower-cased.""" + raw = os.getenv("KUBE_CLUSTERS", _DEFAULT_CLUSTERS) # noqa: TID251 + return [c.strip().lower() for c in raw.split(",") if c.strip()] + + +def cluster_label(cluster: str) -> str: + return os.getenv(f"KUBE_{cluster.upper()}_LABEL", cluster) # noqa: TID251 + + +class KubeError(RuntimeError): + """A kube API call failed or the cluster is misconfigured.""" + + +class KubeClient: + """Read-only client for a single configured cluster.""" + + def __init__(self, cluster: str, timeout: float = 30.0): + self.cluster = cluster.strip().lower() + if self.cluster not in configured_clusters(): + known = ", ".join(configured_clusters()) or "" + raise KubeError(f"unknown cluster {cluster!r}; configured: {known}") + self.timeout = timeout + self._client: httpx.Client | None = None + self._ca_tmp: str | None = None + + @property + def server(self) -> str: + server = os.getenv(f"KUBE_{self.cluster.upper()}_SERVER", "") # noqa: TID251 + if not server: + raise KubeError( + f"KUBE_{self.cluster.upper()}_SERVER is not set; the deployment must " + 
f"inject the API endpoint for cluster {self.cluster!r}" + ) + return server.rstrip("/") + + def _verify(self) -> str | bool: + """Resolve the CA bundle: env override, then packaged file, else off.""" + b64 = os.getenv(f"KUBE_{self.cluster.upper()}_CA_B64", "") # noqa: TID251 + if b64: + fd, path = tempfile.mkstemp(suffix=".pem", prefix=f"kube-{self.cluster}-") + with os.fdopen(fd, "wb") as fh: + fh.write(base64.b64decode(b64)) + self._ca_tmp = path + return path + packaged = _CA_DIR / f"{self.cluster}.pem" + if packaged.is_file(): + return str(packaged) + # No CA on hand. Egress is already host-locked by iron-proxy and this is + # read-only, so fall back to unverified TLS rather than failing hard. + return False + + @property + def client(self) -> httpx.Client: + if self._client is None: + # inject-mode secret: iron-proxy overwrites Authorization with the real + # bearer for this host. The placeholder just keeps a header on the wire. + token = secret(f"K8S_{self.cluster.upper()}_TOKEN", "iron-proxy-injected") + self._client = httpx.Client( + base_url=self.server, + headers={"Authorization": f"Bearer {token}", "Accept": "application/json"}, + verify=self._verify(), + timeout=self.timeout, + ) + return self._client + + def _get(self, path: str, params: dict[str, Any] | None = None) -> dict[str, Any]: + if not path.startswith("/"): + path = "/" + path + resp = self.client.get(path, params={k: v for k, v in (params or {}).items() if v is not None}) + if resp.status_code >= 400: + raise KubeError(f"{self.cluster} GET {path} -> {resp.status_code}: {resp.text[:500]}") + ctype = resp.headers.get("content-type", "") + return resp.json() if ctype.startswith("application/json") else {"raw": resp.text} + + # -- read-only surface ------------------------------------------------- + + def namespaces(self) -> list[dict[str, Any]]: + items = self._get("/api/v1/namespaces").get("items", []) + return [{"name": i["metadata"]["name"], "status": i.get("status", {}).get("phase")} for i 
in items] + + def pods(self, namespace: str, label_selector: str | None = None) -> list[dict[str, Any]]: + data = self._get(f"/api/v1/namespaces/{namespace}/pods", {"labelSelector": label_selector}) + out = [] + for i in data.get("items", []): + st = i.get("status", {}) + cs = st.get("containerStatuses", []) or [] + ready = sum(1 for c in cs if c.get("ready")) + restarts = sum(c.get("restartCount", 0) for c in cs) + out.append( + { + "name": i["metadata"]["name"], + "phase": st.get("phase"), + "ready": f"{ready}/{len(cs)}", + "restarts": restarts, + "node": i.get("spec", {}).get("nodeName"), + } + ) + return out + + def pod_logs( + self, + namespace: str, + pod: str, + container: str | None = None, + tail_lines: int = 200, + previous: bool = False, + ) -> str: + resp = self.client.get( + f"/api/v1/namespaces/{namespace}/pods/{pod}/log", + params={ + k: v + for k, v in { + "container": container, + "tailLines": tail_lines, + "previous": "true" if previous else None, + }.items() + if v is not None + }, + ) + if resp.status_code >= 400: + raise KubeError(f"{self.cluster} logs {namespace}/{pod} -> {resp.status_code}: {resp.text[:500]}") + return resp.text + + def deployments(self, namespace: str) -> list[dict[str, Any]]: + data = self._get(f"/apis/apps/v1/namespaces/{namespace}/deployments") + out = [] + for i in data.get("items", []): + st = i.get("status", {}) + out.append( + { + "name": i["metadata"]["name"], + "ready": f"{st.get('readyReplicas', 0)}/{st.get('replicas', 0)}", + "updated": st.get("updatedReplicas", 0), + "available": st.get("availableReplicas", 0), + } + ) + return out + + def events(self, namespace: str, limit: int = 50) -> list[dict[str, Any]]: + data = self._get(f"/api/v1/namespaces/{namespace}/events", {"limit": limit}) + out = [] + for i in data.get("items", []): + out.append( + { + "type": i.get("type"), + "reason": i.get("reason"), + "object": f"{i.get('involvedObject', {}).get('kind')}/{i.get('involvedObject', {}).get('name')}", + "message": 
i.get("message"), + "last": i.get("lastTimestamp") or i.get("eventTime"), + } + ) + return out + + def nodes(self) -> list[dict[str, Any]]: + data = self._get("/api/v1/nodes") + out = [] + for i in data.get("items", []): + conds = {c["type"]: c["status"] for c in i.get("status", {}).get("conditions", [])} + out.append( + { + "name": i["metadata"]["name"], + "ready": conds.get("Ready"), + "labels": i.get("metadata", {}).get("labels", {}), + } + ) + return out + + def raw_get(self, path: str, params: dict[str, Any] | None = None) -> dict[str, Any]: + """Arbitrary read-only GET against the kube API (RBAC `view` scoped).""" + return self._get(path, params) diff --git a/tools/infra/k8s/pyproject.toml b/tools/infra/k8s/pyproject.toml new file mode 100644 index 000000000..c82afaac6 --- /dev/null +++ b/tools/infra/k8s/pyproject.toml @@ -0,0 +1,45 @@ +[project] +name = "k8s" +description = "Read-only Kubernetes access to the GKE main and sqd-compute-cluster control planes — namespaces, pods, logs, events, and arbitrary read-only GETs" +version = "0.1.0" +requires-python = ">=3.11" +dependencies = [ + "httpx>=0.27.0", + "typer>=0.12.0", + "rich>=13.0.0", + "python-dotenv>=1.0.0", +] + +[project.scripts] +k8s = "centaur_tool_k8s.cli:app" + +[tool.hatch.build.targets.wheel] +packages = ["."] + +[tool.hatch.build.targets.wheel.sources] +"." = "centaur_tool_k8s" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + + +[tool.centaur] +module = "client.py" +# Egress allowlist: the two kube API server endpoints. These are IP:host forms +# because the clusters expose their control plane by IP, not DNS. Must match the +# CONNECT target the httpx client dials (KUBE__SERVER host). +hosts = ["35.246.168.135", "162.19.107.87:6443"] +# One inject-mode HTTP secret per cluster: iron-proxy sets `Authorization: Bearer +# ` on requests to that cluster's API host. 
The token is a long-lived +# ServiceAccount token bound to the built-in `view` ClusterRole (see +# contrib/k8s-readonly/), so the grant is read-only by construction. The client +# never sees the real token; it sends a placeholder the proxy overwrites. +# +# OP items (op://<vault>/<item>/credential): +# K8S_GKE_TOKEN — SA token in GKE main +# K8S_SQD_TOKEN — SA token in sqd-compute-cluster +secrets = [ + { type = "http", name = "K8S_GKE_TOKEN", mode = "inject", inject_header = "Authorization", inject_formatter = "Bearer {{ .Value }}", hosts = ["35.246.168.135"] }, + { type = "http", name = "K8S_SQD_TOKEN", mode = "inject", inject_header = "Authorization", inject_formatter = "Bearer {{ .Value }}", hosts = ["162.19.107.87:6443"] }, +]