From 0861cade5a17228f35d03c3ebd200dcb04e4ddab Mon Sep 17 00:00:00 2001 From: Tom Kirkpatrick <200251+mrfelton@users.noreply.github.com> Date: Thu, 11 Jun 2026 19:15:10 +0200 Subject: [PATCH 1/2] fix: stop token-optimizer self-targeting the AIC monitoring family MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Phase 1 self-targeting guard ("Exclude workflows with \"Token\" in the name") can never fire: gh aw logs reports `workflow_name` as the display name (the H1), which for these two workflows is "Agentic Workflow AIC Usage Optimizer" / "Daily Agentic Workflow AIC Usage Audit" — neither contains "Token". The string only appears in the workflow id / tracker-id. So the monitoring family is never excluded, and the optimizer selects itself (or its sibling audit) as the target. This fixes it two ways: - Deterministic pre-filter in the "Download" step: drop runs whose workflow_path matches `agentic-token-(optimizer|audit)` or whose display name matches `AIC Usage (Optimizer|Audit)`, so neither all-runs.json nor top-workflows.json can list the family. - Correct the Phase 1 prompt guard to key off the actual ids/display names instead of the substring "Token". Source `.md` only — the generated `.lock.yml` files are intentionally left for in-repo regeneration via `gh aw compile` (lock cron jitter and action-pin resolution are repo-scoped, so fork-built locks would not match this repo's CI output). 
Fixes #119 Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/agentic-token-optimizer.md | 13 ++++++++++++- workflows/agentic-token-optimizer.md | 13 ++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/.github/workflows/agentic-token-optimizer.md b/.github/workflows/agentic-token-optimizer.md index 6117f5e..ce3da79 100644 --- a/.github/workflows/agentic-token-optimizer.md +++ b/.github/workflows/agentic-token-optimizer.md @@ -58,6 +58,17 @@ steps: echo '{"runs":[],"summary":{}}' > /tmp/gh-aw/token-audit/all-runs.json fi + # Exclude the AIC monitoring family (this optimizer + its sibling audit) from the + # candidate pool so the optimizer never selects its own meta-monitoring workflows. + # The in-prompt "Token in the name" guard misses these: their display names are + # "...AIC Usage Optimizer/Audit" (no "Token"), so match on workflow id/name here. + jq '.runs |= map(select( + (((.workflow_path // "") | test("agentic-token-(optimizer|audit)")) + or ((.workflow_name // "") | test("AIC Usage (Optimizer|Audit)"))) | not + ))' /tmp/gh-aw/token-audit/all-runs.json > /tmp/gh-aw/token-audit/all-runs.filtered.json \ + && mv /tmp/gh-aw/token-audit/all-runs.filtered.json /tmp/gh-aw/token-audit/all-runs.json + echo "🚫 Excluded AIC monitoring family — $(jq '.runs | length' /tmp/gh-aw/token-audit/all-runs.json) runs remain in candidate pool" + - name: Aggregate top workflows by AIC usage run: | set -euo pipefail @@ -161,7 +172,7 @@ Treat missing numeric fields (`aic`, `token_usage`, `turns`, `action_minutes`) a - Start from `top-workflows.json`. - Exclude workflows optimized in the last 14 days (use `optimization-log.json`). -- Exclude workflows with "Token" in the name to avoid self-targeting. +- Exclude the AIC monitoring family — the `agentic-token-optimizer` and `agentic-token-audit` workflows (display names "Agentic Workflow AIC Usage Optimizer" / "Daily Agentic Workflow AIC Usage Audit") — to avoid self-targeting. 
These are also pre-filtered from `all-runs.json`/`top-workflows.json`, but never select them even if a stale snapshot still lists them. - Choose the highest AI-credit-spend workflow that remains. - If no snapshot/history exists, derive candidates directly from `all-runs.json`. diff --git a/workflows/agentic-token-optimizer.md b/workflows/agentic-token-optimizer.md index 6117f5e..ce3da79 100644 --- a/workflows/agentic-token-optimizer.md +++ b/workflows/agentic-token-optimizer.md @@ -58,6 +58,17 @@ steps: echo '{"runs":[],"summary":{}}' > /tmp/gh-aw/token-audit/all-runs.json fi + # Exclude the AIC monitoring family (this optimizer + its sibling audit) from the + # candidate pool so the optimizer never selects its own meta-monitoring workflows. + # The in-prompt "Token in the name" guard misses these: their display names are + # "...AIC Usage Optimizer/Audit" (no "Token"), so match on workflow id/name here. + jq '.runs |= map(select( + (((.workflow_path // "") | test("agentic-token-(optimizer|audit)")) + or ((.workflow_name // "") | test("AIC Usage (Optimizer|Audit)"))) | not + ))' /tmp/gh-aw/token-audit/all-runs.json > /tmp/gh-aw/token-audit/all-runs.filtered.json \ + && mv /tmp/gh-aw/token-audit/all-runs.filtered.json /tmp/gh-aw/token-audit/all-runs.json + echo "🚫 Excluded AIC monitoring family — $(jq '.runs | length' /tmp/gh-aw/token-audit/all-runs.json) runs remain in candidate pool" + - name: Aggregate top workflows by AIC usage run: | set -euo pipefail @@ -161,7 +172,7 @@ Treat missing numeric fields (`aic`, `token_usage`, `turns`, `action_minutes`) a - Start from `top-workflows.json`. - Exclude workflows optimized in the last 14 days (use `optimization-log.json`). -- Exclude workflows with "Token" in the name to avoid self-targeting. +- Exclude the AIC monitoring family — the `agentic-token-optimizer` and `agentic-token-audit` workflows (display names "Agentic Workflow AIC Usage Optimizer" / "Daily Agentic Workflow AIC Usage Audit") — to avoid self-targeting. 
These are also pre-filtered from `all-runs.json`/`top-workflows.json`, but never select them even if a stale snapshot still lists them. - Choose the highest AI-credit-spend workflow that remains. - If no snapshot/history exists, derive candidates directly from `all-runs.json`. From 23e3cd903ad41f7b06f6d39f5b04e4811f16e416 Mon Sep 17 00:00:00 2001 From: Tom Kirkpatrick <200251+mrfelton@users.noreply.github.com> Date: Fri, 12 Jun 2026 07:18:10 +0200 Subject: [PATCH 2/2] fix: fetch AIC logs per workflow to survive high-CI-volume repos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The audit reported empty days ("0 completed runs") and the optimizer's candidate pool silently truncated in busy repos: unfiltered `gh aw logs` scans repo-wide run batches and stops paginating when a batch filters down to zero processable agentic runs (skipped/cancelled dropped first) — github/gh-aw#38782. 250 repo-wide runs span ~2h of CI in a large monorepo, so the call rarely saw past the most recent runs. Enumerate agentic workflows from .github/workflows/*.lock.yml and call `gh aw logs <workflow-id>` per workflow (workflow-scoped listing is unaffected by repo CI volume), then merge results deduped by run_id into the same {summary, runs} shape the agent prompts already expect. Validated in a production monorepo over a window that produced an empty audit: unfiltered call returned 1 run (the audit itself); per-workflow loop returned 15 runs including 5016/3674/2548-AIC runs. 
Fixes #123 Co-Authored-By: Claude Fable 5 --- .github/workflows/agentic-token-audit.md | 53 +++++++++++++------- .github/workflows/agentic-token-optimizer.md | 49 +++++++++++++----- workflows/agentic-token-audit.md | 53 +++++++++++++------- workflows/agentic-token-optimizer.md | 49 +++++++++++++----- 4 files changed, 144 insertions(+), 60 deletions(-) diff --git a/.github/workflows/agentic-token-audit.md b/.github/workflows/agentic-token-audit.md index 5a4aef9..89c97d0 100644 --- a/.github/workflows/agentic-token-audit.md +++ b/.github/workflows/agentic-token-audit.md @@ -51,27 +51,46 @@ steps: set -euo pipefail mkdir -p /tmp/gh-aw/token-audit - # Download last 24 hours of agentic workflow logs as JSON - # Allow partial results — gh aw logs streams incrementally, so even if - # it hits an API rate limit partway through, the JSON written so far is - # still valid and should be processed by the agent. - LOGS_EXIT=0 - gh aw logs \ - --start-date -1d \ - --json \ - -c 100 \ - > /tmp/gh-aw/token-audit/workflow-logs.json || LOGS_EXIT=$? - - if [ -s /tmp/gh-aw/token-audit/workflow-logs.json ]; then - TOTAL=$(jq '.runs | length' /tmp/gh-aw/token-audit/workflow-logs.json) - echo "✅ Downloaded $TOTAL agentic workflow runs (last 24 hours)" - if [ "$LOGS_EXIT" -ne 0 ]; then - echo "⚠️ gh aw logs exited with code $LOGS_EXIT (partial results — likely API rate limit)" + # Download last 24 hours of agentic workflow logs as JSON, one workflow + # at a time. `gh aw logs` without a workflow filter scans repo-wide + # `gh run list` batches (newest-first, 250 runs each) and stops + # paginating as soon as one batch contains no processable agentic runs + # (skipped/cancelled runs are dropped before the empty-batch check — + # see github/gh-aw#38782). In a high-CI-volume repo a batch spans only + # a couple of hours, so the unfiltered call usually saw nothing but + # this run itself and reported an empty day. Workflow-scoped listing is + # unaffected by repo CI volume. 
Partial results are fine — each + # per-workflow file that was written successfully still gets merged. + PARTS_DIR=/tmp/gh-aw/token-audit/log-parts + mkdir -p "$PARTS_DIR" + + for lock in .github/workflows/*.lock.yml; do + id=$(basename "$lock" .lock.yml) + PART_EXIT=0 + gh aw logs "$id" \ + --start-date -1d \ + --json \ + -c 100 \ + > "$PARTS_DIR/$id.json" || PART_EXIT=$? + if [ -s "$PARTS_DIR/$id.json" ]; then + COUNT=$(jq '.runs | length' "$PARTS_DIR/$id.json" 2>/dev/null || echo 0) + echo "✅ $id: $COUNT runs (exit code $PART_EXIT)" + else + echo "⚠️ $id: no log data (exit code $PART_EXIT)" + rm -f "$PARTS_DIR/$id.json" fi + done + + if ls "$PARTS_DIR"/*.json >/dev/null 2>&1; then + jq -s '{summary: {}, runs: (map(.runs // []) | add | unique_by(.run_id))}' \ + "$PARTS_DIR"/*.json > /tmp/gh-aw/token-audit/workflow-logs.json else - echo "❌ No log data downloaded (exit code $LOGS_EXIT)" + echo "❌ No log data downloaded for any workflow" echo '{"runs":[],"summary":{}}' > /tmp/gh-aw/token-audit/workflow-logs.json fi + + TOTAL=$(jq '.runs | length' /tmp/gh-aw/token-audit/workflow-logs.json) + echo "✅ Merged $TOTAL agentic workflow runs (last 24 hours)" timeout-minutes: 25 --- diff --git a/.github/workflows/agentic-token-optimizer.md b/.github/workflows/agentic-token-optimizer.md index ce3da79..df8d00d 100644 --- a/.github/workflows/agentic-token-optimizer.md +++ b/.github/workflows/agentic-token-optimizer.md @@ -40,24 +40,47 @@ steps: echo "📥 Downloading agentic workflow logs (last 7 days)..." - LOGS_EXIT=0 - gh aw logs \ - --start-date -7d \ - --json \ - -c 50 \ - > /tmp/gh-aw/token-audit/all-runs.json || LOGS_EXIT=$? - - if [ -s /tmp/gh-aw/token-audit/all-runs.json ]; then - TOTAL=$(jq '.runs | length' /tmp/gh-aw/token-audit/all-runs.json) - echo "✅ Downloaded $TOTAL agentic workflow runs (last 7 days)" - if [ "$LOGS_EXIT" -ne 0 ]; then - echo "⚠️ gh aw logs exited with code $LOGS_EXIT (partial results — likely API rate limit)" + # Fetch logs one workflow at a time. 
`gh aw logs` without a workflow + # filter scans repo-wide `gh run list` batches (newest-first, 250 runs + # each) and stops paginating as soon as one batch contains no + # processable agentic runs (skipped/cancelled runs are dropped before + # the empty-batch check — see github/gh-aw#38782). In a high-CI-volume + # repo a batch spans only a couple of hours, so the unfiltered call + # truncates the candidate pool to whatever ran most recently. + # Workflow-scoped listing is unaffected by repo CI volume. Partial + # results are fine — each per-workflow file that was written + # successfully still gets merged. + PARTS_DIR=/tmp/gh-aw/token-audit/log-parts + mkdir -p "$PARTS_DIR" + + for lock in .github/workflows/*.lock.yml; do + id=$(basename "$lock" .lock.yml) + PART_EXIT=0 + gh aw logs "$id" \ + --start-date -7d \ + --json \ + -c 50 \ + > "$PARTS_DIR/$id.json" || PART_EXIT=$? + if [ -s "$PARTS_DIR/$id.json" ]; then + COUNT=$(jq '.runs | length' "$PARTS_DIR/$id.json" 2>/dev/null || echo 0) + echo "✅ $id: $COUNT runs (exit code $PART_EXIT)" + else + echo "⚠️ $id: no log data (exit code $PART_EXIT)" + rm -f "$PARTS_DIR/$id.json" fi + done + + if ls "$PARTS_DIR"/*.json >/dev/null 2>&1; then + jq -s '{summary: {}, runs: (map(.runs // []) | add | unique_by(.run_id))}' \ + "$PARTS_DIR"/*.json > /tmp/gh-aw/token-audit/all-runs.json else - echo "❌ No log data downloaded (exit code $LOGS_EXIT)" + echo "❌ No log data downloaded for any workflow" echo '{"runs":[],"summary":{}}' > /tmp/gh-aw/token-audit/all-runs.json fi + TOTAL=$(jq '.runs | length' /tmp/gh-aw/token-audit/all-runs.json) + echo "✅ Merged $TOTAL agentic workflow runs (last 7 days)" + # Exclude the AIC monitoring family (this optimizer + its sibling audit) from the # candidate pool so the optimizer never selects its own meta-monitoring workflows. 
# The in-prompt "Token in the name" guard misses these: their display names are diff --git a/workflows/agentic-token-audit.md b/workflows/agentic-token-audit.md index 5a4aef9..89c97d0 100644 --- a/workflows/agentic-token-audit.md +++ b/workflows/agentic-token-audit.md @@ -51,27 +51,46 @@ steps: set -euo pipefail mkdir -p /tmp/gh-aw/token-audit - # Download last 24 hours of agentic workflow logs as JSON - # Allow partial results — gh aw logs streams incrementally, so even if - # it hits an API rate limit partway through, the JSON written so far is - # still valid and should be processed by the agent. - LOGS_EXIT=0 - gh aw logs \ - --start-date -1d \ - --json \ - -c 100 \ - > /tmp/gh-aw/token-audit/workflow-logs.json || LOGS_EXIT=$? - - if [ -s /tmp/gh-aw/token-audit/workflow-logs.json ]; then - TOTAL=$(jq '.runs | length' /tmp/gh-aw/token-audit/workflow-logs.json) - echo "✅ Downloaded $TOTAL agentic workflow runs (last 24 hours)" - if [ "$LOGS_EXIT" -ne 0 ]; then - echo "⚠️ gh aw logs exited with code $LOGS_EXIT (partial results — likely API rate limit)" + # Download last 24 hours of agentic workflow logs as JSON, one workflow + # at a time. `gh aw logs` without a workflow filter scans repo-wide + # `gh run list` batches (newest-first, 250 runs each) and stops + # paginating as soon as one batch contains no processable agentic runs + # (skipped/cancelled runs are dropped before the empty-batch check — + # see github/gh-aw#38782). In a high-CI-volume repo a batch spans only + # a couple of hours, so the unfiltered call usually saw nothing but + # this run itself and reported an empty day. Workflow-scoped listing is + # unaffected by repo CI volume. Partial results are fine — each + # per-workflow file that was written successfully still gets merged. 
+ PARTS_DIR=/tmp/gh-aw/token-audit/log-parts + mkdir -p "$PARTS_DIR" + + for lock in .github/workflows/*.lock.yml; do + id=$(basename "$lock" .lock.yml) + PART_EXIT=0 + gh aw logs "$id" \ + --start-date -1d \ + --json \ + -c 100 \ + > "$PARTS_DIR/$id.json" || PART_EXIT=$? + if [ -s "$PARTS_DIR/$id.json" ]; then + COUNT=$(jq '.runs | length' "$PARTS_DIR/$id.json" 2>/dev/null || echo 0) + echo "✅ $id: $COUNT runs (exit code $PART_EXIT)" + else + echo "⚠️ $id: no log data (exit code $PART_EXIT)" + rm -f "$PARTS_DIR/$id.json" fi + done + + if ls "$PARTS_DIR"/*.json >/dev/null 2>&1; then + jq -s '{summary: {}, runs: (map(.runs // []) | add | unique_by(.run_id))}' \ + "$PARTS_DIR"/*.json > /tmp/gh-aw/token-audit/workflow-logs.json else - echo "❌ No log data downloaded (exit code $LOGS_EXIT)" + echo "❌ No log data downloaded for any workflow" echo '{"runs":[],"summary":{}}' > /tmp/gh-aw/token-audit/workflow-logs.json fi + + TOTAL=$(jq '.runs | length' /tmp/gh-aw/token-audit/workflow-logs.json) + echo "✅ Merged $TOTAL agentic workflow runs (last 24 hours)" timeout-minutes: 25 --- diff --git a/workflows/agentic-token-optimizer.md b/workflows/agentic-token-optimizer.md index ce3da79..df8d00d 100644 --- a/workflows/agentic-token-optimizer.md +++ b/workflows/agentic-token-optimizer.md @@ -40,24 +40,47 @@ steps: echo "📥 Downloading agentic workflow logs (last 7 days)..." - LOGS_EXIT=0 - gh aw logs \ - --start-date -7d \ - --json \ - -c 50 \ - > /tmp/gh-aw/token-audit/all-runs.json || LOGS_EXIT=$? - - if [ -s /tmp/gh-aw/token-audit/all-runs.json ]; then - TOTAL=$(jq '.runs | length' /tmp/gh-aw/token-audit/all-runs.json) - echo "✅ Downloaded $TOTAL agentic workflow runs (last 7 days)" - if [ "$LOGS_EXIT" -ne 0 ]; then - echo "⚠️ gh aw logs exited with code $LOGS_EXIT (partial results — likely API rate limit)" + # Fetch logs one workflow at a time. 
`gh aw logs` without a workflow + # filter scans repo-wide `gh run list` batches (newest-first, 250 runs + # each) and stops paginating as soon as one batch contains no + # processable agentic runs (skipped/cancelled runs are dropped before + # the empty-batch check — see github/gh-aw#38782). In a high-CI-volume + # repo a batch spans only a couple of hours, so the unfiltered call + # truncates the candidate pool to whatever ran most recently. + # Workflow-scoped listing is unaffected by repo CI volume. Partial + # results are fine — each per-workflow file that was written + # successfully still gets merged. + PARTS_DIR=/tmp/gh-aw/token-audit/log-parts + mkdir -p "$PARTS_DIR" + + for lock in .github/workflows/*.lock.yml; do + id=$(basename "$lock" .lock.yml) + PART_EXIT=0 + gh aw logs "$id" \ + --start-date -7d \ + --json \ + -c 50 \ + > "$PARTS_DIR/$id.json" || PART_EXIT=$? + if [ -s "$PARTS_DIR/$id.json" ]; then + COUNT=$(jq '.runs | length' "$PARTS_DIR/$id.json" 2>/dev/null || echo 0) + echo "✅ $id: $COUNT runs (exit code $PART_EXIT)" + else + echo "⚠️ $id: no log data (exit code $PART_EXIT)" + rm -f "$PARTS_DIR/$id.json" fi + done + + if ls "$PARTS_DIR"/*.json >/dev/null 2>&1; then + jq -s '{summary: {}, runs: (map(.runs // []) | add | unique_by(.run_id))}' \ + "$PARTS_DIR"/*.json > /tmp/gh-aw/token-audit/all-runs.json else - echo "❌ No log data downloaded (exit code $LOGS_EXIT)" + echo "❌ No log data downloaded for any workflow" echo '{"runs":[],"summary":{}}' > /tmp/gh-aw/token-audit/all-runs.json fi + TOTAL=$(jq '.runs | length' /tmp/gh-aw/token-audit/all-runs.json) + echo "✅ Merged $TOTAL agentic workflow runs (last 7 days)" + # Exclude the AIC monitoring family (this optimizer + its sibling audit) from the # candidate pool so the optimizer never selects its own meta-monitoring workflows. # The in-prompt "Token in the name" guard misses these: their display names are