fix(gcp): use authorized session for workbench notebooks API (#2442)

kunaals · web-flow · commit 5da2616a9466 · 2026-03-03T19:49:18.000Z
### Type of change

- [x] Bug fix (non-breaking change that fixes an issue)
- [x] Documentation update

### Summary

This fixes recurring GCP Vertex AI Workbench sync failures where `notebooks.googleapis.com` location discovery returned `401 Unauthorized` in production/staging runs.

Changes made:

- Switched Workbench Notebooks API calls to use `google.auth.transport.requests.AuthorizedSession` instead of manually constructing `Authorization: Bearer ...` headers from discovery credentials.
- Extended `paginate_vertex_api()` to optionally execute requests via a provided authorized session (used by the Workbench v2 instance listing path).
- Added focused unit tests for:
  - authorized-session usage in Workbench location discovery
  - `401` handling behavior
  - passing the authorized session through to paginated Workbench instance calls
- Updated GCP configuration docs to include:
  - optional `roles/notebooks.viewer` and `roles/run.viewer`
  - enabling `notebooks.googleapis.com`
  - a note on per-location Cloud Run permission warnings being skipped gracefully

### Breaking changes

None.

### How was this tested?

- added tests
- tested locally

### Checklist

#### General

- [x] I have read the [contributing guidelines](https://cartography-cncf.github.io/cartography/dev/developer-guide.html).
- [x] The linter passes locally (`make lint`).
- [x] I have added/updated tests that prove my fix is effective or my feature works.

#### Proof of functionality

- [x] New or updated unit/integration tests.

#### If you are changing a node or relationship

- [ ] Updated the [schema documentation](https://github.com/cartography-cncf/cartography/tree/master/docs/root/modules).
- [ ] Updated the [schema README](https://github.com/cartography-cncf/cartography/blob/master/docs/schema/README.md).

#### If you are implementing a new intel module

- [ ] Used the NodeSchema [data model](https://cartography-cncf.github.io/cartography/dev/writing-intel-modules.html#defining-a-node).

### Notes for reviewers

- The `401` appeared consistently in multiple production-like runs for Workbench location discovery, while other Vertex API calls succeeded in the same runs.
- This PR intentionally keeps behavior non-fatal for Workbench discovery failures (skip Workbench for that project/run), matching existing graceful-degradation behavior.

---------

Signed-off-by: Kunaal Sikka <kunaal@subimage.io>
diff --git a/cartography/intel/gcp/vertex/instances.py b/cartography/intel/gcp/vertex/instances.py
@@ -1,8 +1,10 @@
 import logging
 from typing import Dict
 from typing import List
+from typing import Optional
 
 import neo4j
+import requests
 from googleapiclient.discovery import Resource
 
 from cartography.client.core.tx import load
@@ -14,98 +16,105 @@
 
 
 @timeit
-def get_workbench_api_locations(aiplatform: Resource, project_id: str) -> List[str]:
+def get_workbench_api_locations(
+    aiplatform: Resource,
+    project_id: str,
+) -> Optional[List[str]]:
     """
     Gets all available Workbench (In Notebooks API) API locations for a project.
     The Notebooks API uses both zones and regions, unlike Vertex AI which primarily uses regions.
     Filters to commonly-used locations to improve sync performance.
     """
-    import requests
-    from google.auth.transport.requests import Request as AuthRequest
+    from google.auth.transport.requests import AuthorizedSession
 
-    # Get credentials and refresh token if needed
     creds = aiplatform._http.credentials
-    if not creds.valid:
-        creds.refresh(AuthRequest())
+    session = AuthorizedSession(creds)
 
     # Query Notebooks API for available locations
     notebooks_endpoint = "https://notebooks.googleapis.com"
     url = f"{notebooks_endpoint}/v1/projects/{project_id}/locations"
-    headers = {
-        "Authorization": f"Bearer {creds.token}",
-        "Content-Type": "application/json",
-    }
+    response = session.get(url, timeout=60)
+
+    if response.status_code == 401:
+        logger.warning(
+            "Unauthorized when trying to get Notebooks API locations for project %s. "
+            "Ensure credentials are valid for notebooks.googleapis.com and that the "
+            "Notebooks API is enabled on the host/quota project.",
+            project_id,
+        )
+        return None
+    if response.status_code == 403:
+        logger.warning(
+            "Access forbidden when trying to get Notebooks API locations for project %s. "
+            "Ensure the Notebooks API is enabled and you have the necessary permissions.",
+            project_id,
+        )
+        return None
+    if response.status_code == 404:
+        logger.warning(
+            "Notebooks API locations not found for project %s. "
+            "The Notebooks API may not be enabled.",
+            project_id,
+        )
+        return None
 
     try:
-        response = requests.get(url, headers=headers)
         response.raise_for_status()
-        data = response.json()
-
-        # Filter to commonly-used locations to avoid excessive API calls
-        # Include major regions and their zones
-        # Reference: https://cloud.google.com/vertex-ai/docs/general/locations
-        supported_prefixes = {
-            "us-central1",
-            "us-east1",
-            "us-east4",
-            "us-west1",
-            "us-west2",
-            "us-west3",
-            "us-west4",
-            "europe-west1",
-            "europe-west2",
-            "europe-west3",
-            "europe-west4",
-            "asia-east1",
-            "asia-northeast1",
-            "asia-northeast3",
-            "asia-southeast1",
-            "australia-southeast1",
-            "northamerica-northeast1",
-            "southamerica-east1",
-        }
-
-        locations = []
-        all_locations = data.get("locations", [])
-        for location in all_locations:
-            # Extract location ID from the full path
-            # Format: "projects/PROJECT_ID/locations/LOCATION_ID"
-            location_id = location.get("locationId", "")
-
-            # Check if this location matches any of our supported prefixes
-            # This handles both regions (us-central1) and zones (us-central1-a, us-central1-b)
-            if any(location_id.startswith(prefix) for prefix in supported_prefixes):
-                locations.append(location_id)
-
-        logger.info(
-            f"Found {len(locations)} supported Notebooks API locations "
-            f"(filtered from {len(all_locations)} total) for project {project_id}"
-        )
-        return locations
-
-    except requests.exceptions.HTTPError as e:
-        if e.response.status_code == 403:
-            logger.warning(
-                f"Access forbidden when trying to get Notebooks API locations for project {project_id}. "
-                "Ensure the Notebooks API is enabled and you have the necessary permissions.",
-            )
-        elif e.response.status_code == 404:
-            logger.warning(
-                f"Notebooks API locations not found for project {project_id}. "
-                "The Notebooks API may not be enabled.",
-            )
-        else:
-            logger.error(
-                f"Error getting Notebooks API locations for project {project_id}: {e}",
-                exc_info=True,
-            )
-        return []
-    except Exception as e:
+    except requests.HTTPError:
         logger.error(
-            f"Unexpected error getting Notebooks API locations for project {project_id}: {e}",
+            "Error getting Notebooks API locations for project %s: HTTP %s - %s",
+            project_id,
+            response.status_code,
+            response.reason,
             exc_info=True,
         )
-        return []
+        raise
+
+    data = response.json()
+
+    # Filter to commonly-used locations to avoid excessive API calls
+    # Include major regions and their zones
+    # Reference: https://cloud.google.com/vertex-ai/docs/general/locations
+    supported_prefixes = {
+        "us-central1",
+        "us-east1",
+        "us-east4",
+        "us-west1",
+        "us-west2",
+        "us-west3",
+        "us-west4",
+        "europe-west1",
+        "europe-west2",
+        "europe-west3",
+        "europe-west4",
+        "asia-east1",
+        "asia-northeast1",
+        "asia-northeast3",
+        "asia-southeast1",
+        "australia-southeast1",
+        "northamerica-northeast1",
+        "southamerica-east1",
+    }
+
+    locations = []
+    all_locations = data.get("locations", [])
+    for location in all_locations:
+        # Extract location ID from the full path
+        # Format: "projects/PROJECT_ID/locations/LOCATION_ID"
+        location_id = location.get("locationId", "")
+
+        # Check if this location matches any of our supported prefixes
+        # This handles both regions (us-central1) and zones (us-central1-a, us-central1-b)
+        if any(location_id.startswith(prefix) for prefix in supported_prefixes):
+            locations.append(location_id)
+
+    logger.info(
+        "Found %s supported Notebooks API locations (filtered from %s total) for project %s",
+        len(locations),
+        len(all_locations),
+        project_id,
+    )
+    return locations
 
 
 @timeit
@@ -119,33 +128,28 @@ def get_workbench_instances_for_location(
     Note: This queries the Notebooks API v2 for Workbench instances. The v2 API is used
     by the GCP Console for creating new Workbench instances. The v1 API is deprecated.
     """
-    from google.auth.transport.requests import Request as AuthRequest
+    from google.auth.transport.requests import AuthorizedSession
 
     from cartography.intel.gcp.vertex.utils import paginate_vertex_api
 
-    # Get credentials and refresh token if needed
     creds = aiplatform._http.credentials
-    if not creds.valid:
-        creds.refresh(AuthRequest())
+    session = AuthorizedSession(creds)
 
     # Prepare request parameters for Notebooks API v2
     # Workbench Instances use notebooks.googleapis.com/v2, not aiplatform.googleapis.com
     notebooks_endpoint = "https://notebooks.googleapis.com"
     parent = f"projects/{project_id}/locations/{location}"
-    headers = {
-        "Authorization": f"Bearer {creds.token}",
-        "Content-Type": "application/json",
-    }
     url = f"{notebooks_endpoint}/v2/{parent}/instances"
 
     # Use helper function to handle pagination and error handling
     return paginate_vertex_api(
         url=url,
-        headers=headers,
+        headers={"Content-Type": "application/json"},
         resource_type="workbench instances",
         response_key="instances",
         location=location,
         project_id=project_id,
+        session=session,
     )
 
 
@@ -184,7 +188,8 @@ def transform_workbench_instances(instances: List[Dict]) -> List[Dict]:
         transformed_instances.append(transformed_instance)
 
     logger.info(
-        f"Transformed {len(transformed_instances)} Vertex AI Workbench instances"
+        "Transformed %s Vertex AI Workbench instances",
+        len(transformed_instances),
     )
     return transformed_instances
 
@@ -234,6 +239,12 @@ def sync_workbench_instances(
     # Note: We use the Notebooks API location list, not Vertex AI locations, because
     # Workbench Instances can be deployed in zones (e.g., us-east1-b) not just regions
     locations = get_workbench_api_locations(aiplatform, project_id)
+    if locations is None:
+        logger.warning(
+            "Skipping Vertex AI Workbench instances sync for project %s to preserve existing data.",
+            project_id,
+        )
+        return
 
     # Collect instances from all locations
     all_instances = []
diff --git a/cartography/intel/gcp/vertex/utils.py b/cartography/intel/gcp/vertex/utils.py
@@ -31,20 +31,29 @@ def handle_vertex_api_response(
     """
     if response.status_code == 404:
         logger.debug(
-            f"Vertex AI {resource_type} not found in {location} for project {project_id}. "
-            f"This location may not have any {resource_type}."
+            "Vertex AI %s not found in %s for project %s. This location may not have any %s.",
+            resource_type,
+            location,
+            project_id,
+            resource_type,
         )
         return None, False
     elif response.status_code == 403:
         logger.warning(
-            f"Access forbidden when trying to get Vertex AI {resource_type} in {location} "
-            f"for project {project_id}."
+            "Access forbidden when trying to get Vertex AI %s in %s for project %s.",
+            resource_type,
+            location,
+            project_id,
         )
         return None, False
     elif response.status_code != 200:
         logger.error(
-            f"Error getting Vertex AI {resource_type} in {location} for project {project_id}: "
-            f"HTTP {response.status_code} - {response.reason}",
+            "Error getting Vertex AI %s in %s for project %s: HTTP %s - %s",
+            resource_type,
+            location,
+            project_id,
+            response.status_code,
+            response.reason,
             exc_info=False,
         )
         return None, False
@@ -55,34 +64,40 @@ def handle_vertex_api_response(
 
 def paginate_vertex_api(
     url: str,
-    headers: Dict[str, str],
+    headers: Optional[Dict[str, str]],
     resource_type: str,
     response_key: str,
     location: str,
     project_id: str,
+    session: Optional[Any] = None,
 ) -> List[Dict]:
     """
     Handle paginated requests to Vertex AI regional endpoints.
 
     :param url: Base API URL (without pagination params)
-    :param headers: HTTP headers including Authorization
+    :param headers: Optional HTTP headers
     :param resource_type: Type of resource (for logging)
     :param response_key: Key in JSON response containing the resource list
     :param location: GCP location/region
     :param project_id: GCP project ID
+    :param session: Optional authorized session used to execute requests
     :return: List of all resources across all pages
     """
     import requests
 
     resources = []
     page_token = None
+    request_headers = headers or {}
 
     while True:
         params: Dict[str, str] = {}
         if page_token:
             params["pageToken"] = page_token
 
-        response = requests.get(url, headers=headers, params=params)
+        if session is not None:
+            response = session.get(url, headers=request_headers, params=params)
+        else:
+            response = requests.get(url, headers=request_headers, params=params)
 
         # Handle response with common error patterns
         data, should_continue = handle_vertex_api_response(
@@ -101,6 +116,10 @@ def paginate_vertex_api(
             break
 
     logger.info(
-        f"Found {len(resources)} Vertex AI {resource_type} in {location} for project {project_id}"
+        "Found %s Vertex AI %s in %s for project %s",
+        len(resources),
+        resource_type,
+        location,
+        project_id,
     )
     return resources
diff --git a/docs/root/modules/gcp/config.md b/docs/root/modules/gcp/config.md
@@ -19,6 +19,8 @@ Grant the following roles to the identity at the **organization level**. This en
 | `roles/bigquery.connectionUser` | List BigQuery connections | Optional |
 | `roles/cloudasset.viewer` | Sync IAM policy bindings (effective policies across org hierarchy) | Optional |
 | `roles/artifactregistry.reader` | List/get Artifact Registry repositories and artifacts | Optional |
+| `roles/run.viewer` | List/get Cloud Run services, jobs, and executions | Optional |
+| `roles/notebooks.viewer` | List/get Vertex AI Workbench (Notebooks API) resources | Optional |
 
 To grant a role at the organization level:
 ```bash
@@ -68,6 +70,7 @@ gcloud services enable secretmanager.googleapis.com --project=YOUR_HOST_PROJECT
 gcloud services enable artifactregistry.googleapis.com --project=YOUR_HOST_PROJECT
 gcloud services enable run.googleapis.com --project=YOUR_HOST_PROJECT
 gcloud services enable aiplatform.googleapis.com --project=YOUR_HOST_PROJECT
+gcloud services enable notebooks.googleapis.com --project=YOUR_HOST_PROJECT
 gcloud services enable cloudasset.googleapis.com --project=YOUR_HOST_PROJECT
 ```
 
@@ -79,6 +82,8 @@ If you set `GOOGLE_CLOUD_QUOTA_PROJECT` to override the default quota project, e
 
 If an API is not enabled on your host/quota project, Cartography will log a warning and skip syncing that resource type rather than crashing. Other modules will continue normally.
 
+Some services also produce per-location permission warnings (for example, Cloud Run in restricted regions). Cartography logs these warnings and skips only the affected locations.
+
 ### Cloud Asset Inventory (CAI)
 
 Cartography uses the [Cloud Asset Inventory API](https://cloud.google.com/asset-inventory/docs/overview) for two features:
diff --git a/tests/unit/cartography/intel/gcp/vertex/test_instances.py b/tests/unit/cartography/intel/gcp/vertex/test_instances.py