diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
new file mode 100644
index 000000000..e81c65f64
--- /dev/null
+++ b/.github/workflows/benchmark-gpu.yml
@@ -0,0 +1,384 @@
+name: Benchmark GPU (PR)
+
+# Rent an RTX 5090 on Vast.ai (hourly) and run the drift-free A/B/B/A (ABBA) paired
+# prover benchmark — the same method as the CPU `/bench-abba` (scripts/bench_abba.sh) —
+# but with the CUDA prover path enabled (BENCH_FEATURES=jemalloc-stats,prover/cuda).
+# It builds the cli at the PR head and at main, runs N interleaved pairs on the GPU,
+# posts the paired-t + Wilcoxon verdict back to the PR, then ALWAYS destroys the box.
+#
+# Triggered by a "/bench-gpu [N]" comment on a PR (N = pair count, default 10) or via
+# workflow_dispatch. Orchestration runs on a GitHub-hosted runner; all GPU work happens
+# on the rented Vast box (provisioned by the template onstart).
+#
+# Requires repo secrets:
+#   VAST_API_KEY        — https://cloud.vast.ai/manage-keys/
+#   VAST_TEMPLATE_HASH  — hash of the "NVIDIA CUDA Lambda VM 64GB" template
+
+on:
+  # Manual run from the Actions tab: benchmarks the dispatched ref against main.
+  workflow_dispatch:
+    inputs:
+      pairs:
+        description: 'Number of A/B/B/A pairs'
+        # TEMP(testing): "1" keeps runs fast; restore to "10" before merge.
+        default: '1'
+  # Every new issue/PR comment fires this; the job-level `if` keeps only "/bench-gpu".
+  issue_comment:
+    types:
+      - created
+  # TEMP(testing): a push run uses the workflow definition from the pushed branch, so
+  # this lets the workflow be exercised before it exists on the default branch
+  # (issue_comment/workflow_dispatch do not). REMOVE this push trigger before merging.
+  push:
+    branches:
+      - gpu_benchmarks
+
+# Scopes granted to this workflow's GITHUB_TOKEN.
+permissions:
+  contents: read
+  # The job reacts to the triggering comment and creates/updates comments on the PR
+  # through the REST API (reactions + issues endpoints).
+  pull-requests: write
+  issues: write
+
+# One GPU benchmark per PR at a time; a newer "/bench-gpu" supersedes the running one.
+# The group is keyed on the PR number ONLY for events that pass the job gate (a
+# privileged "/bench-gpu" comment on a PR). This workflow is triggered by every issue
+# comment, and workflow-level concurrency is applied before the job-level `if` is
+# evaluated, so keying on the issue number alone would let any unrelated comment on
+# the PR cancel an in-flight benchmark. All other events get a unique per-run group.
+concurrency:
+  group: >-
+    benchmark-gpu-${{
+    (github.event_name == 'issue_comment' &&
+    github.event.issue.pull_request &&
+    startsWith(github.event.comment.body, '/bench-gpu') &&
+    contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association) &&
+    github.event.issue.number) || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  # Offer-search knobs shared by the steps below. The search asks for an RTX 5090 with
+  # >=16 cores, >=96GB RAM, >=64GB disk, verified + rentable, a Blackwell-capable
+  # driver, and an hourly price at or below PRICE_CAP.
+  GPU_NAME: 'RTX_5090'
+  PRICE_CAP: '3'
+  # Disk size passed to `vastai create instance --disk`.
+  VAST_IMAGE_DISK: '64'
+  # Cargo features for the ABBA cli build: the CUDA prover path plus jemalloc heap stats.
+  BENCH_FEATURES: 'jemalloc-stats,prover/cuda'
+
+jobs:
+  benchmark-gpu:
+    # Orchestration only: the GPU work runs over SSH on the rented Vast box.
+    runs-on: ubuntu-latest
+    # Run gate: workflow_dispatch, or a comment on a PR that starts with "/bench-gpu"
+    # and whose author association is MEMBER, OWNER or COLLABORATOR.
+    # TEMP(testing): `github.event_name == 'push'` lets branch pushes run it pre-merge.
+    # REMOVE the push clause before merging.
+    if: >-
+      github.event_name == 'push' ||
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'issue_comment' &&
+       github.event.issue.pull_request &&
+       startsWith(github.event.comment.body, '/bench-gpu') &&
+       contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
+    # Time budget: dual cuda build (~15-30 min) + 2*pairs proves (~77s each), plus
+    # renting and provisioning the box. 180 minutes is the ceiling for the whole job.
+    timeout-minutes: 180
+    steps:
+      - name: Resolve PR ref + pair count
+        id: config
+        env:
+          GH_TOKEN: ${{ github.token }}
+          EVENT_NAME: ${{ github.event_name }}
+          COMMENT_BODY: ${{ github.event.comment.body }}
+          PR_NUM: ${{ github.event.issue.number }}
+          DISPATCH_PAIRS: ${{ github.event.inputs.pairs }}
+          DISPATCH_REF: ${{ github.ref_name }}
+        run: |
+          # Decide what to benchmark and how many pairs to run.
+          # Outputs: pr_num + head_sha (comment-triggered runs), branch (dispatch/push
+          # runs), and pairs (a validated decimal integer in [1,40]).
+          if [ "$EVENT_NAME" = "issue_comment" ]; then
+            # Pin the head SHA (works for fork PRs; avoids a force-push race mid-run).
+            HEAD_SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
+            OUT_PR_NUM="$PR_NUM"; OUT_HEAD_SHA="$HEAD_SHA"; OUT_BRANCH=""
+            # "/bench-gpu 20" -> 20 pairs; otherwise default.
+            N=$(echo "$COMMENT_BODY" | sed -n 's|^/bench-gpu[[:space:]]*\([0-9]\+\).*|\1|p')
+            PAIRS=${N:-1}   # TEMP(testing): default 1; restore to 10 before merge
+          else
+            # workflow_dispatch / push: compare this branch vs main.
+            OUT_PR_NUM=""; OUT_HEAD_SHA=""; OUT_BRANCH="$DISPATCH_REF"
+            PAIRS=${DISPATCH_PAIRS:-1}   # TEMP(testing): default 1; restore to 10 before merge
+          fi
+          # Accept only a 1-2 digit decimal. A bare `[ "$PAIRS" -lt 1 ]` errors (and so
+          # counts as "in range") on non-numeric input, which would let values such as
+          # "abc" or "1; cmd" through to the remote command line built later. Anything
+          # else is mapped to 0 so the range check below rejects it.
+          case "$PAIRS" in
+            [0-9]|[0-9][0-9]) PAIRS=$((10#$PAIRS)) ;;   # 10#: "08" is decimal 8, not bad octal
+            *) PAIRS=0 ;;
+          esac
+          # TEMP(testing): clamp floor lowered to 1 for fast runs; restore to [2,40] before merge.
+          if [ "$PAIRS" -lt 1 ] || [ "$PAIRS" -gt 40 ]; then
+            echo "::warning::pair count out of range [1,40], defaulting to 1"
+            PAIRS=1
+          fi
+          {
+            echo "pr_num=$OUT_PR_NUM"
+            echo "head_sha=$OUT_HEAD_SHA"
+            echo "branch=$OUT_BRANCH"
+            echo "pairs=$PAIRS"
+          } >> "$GITHUB_OUTPUT"
+          echo "Using $PAIRS A/B/B/A pairs"
+
+      - name: Acknowledge (react + occupancy notice)
+        if: github.event_name == 'issue_comment'
+        uses: actions/github-script@v7
+        env:
+          PAIRS: ${{ steps.config.outputs.pairs }}
+        with:
+          script: |
+            // Tell the requester the command was picked up: an "eyes" reaction on the
+            // triggering comment, then a short notice on the PR.
+            const { owner, repo } = context.repo;
+            const pairCount = process.env.PAIRS;
+
+            await github.rest.reactions.createForIssueComment({
+              owner,
+              repo,
+              comment_id: context.payload.comment.id,
+              content: 'eyes',
+            });
+
+            const notice = `⏳ **GPU ABBA started** — renting an RTX 5090 on Vast.ai and running ${pairCount} interleaved pairs (PR vs main) on the CUDA prover path. This takes ~1 hr; results will be posted here.`;
+            await github.rest.issues.createComment({
+              owner,
+              repo,
+              issue_number: context.issue.number,
+              body: notice,
+            });
+
+      - name: Install Vast CLI
+        env:
+          VAST_API_KEY: ${{ secrets.VAST_API_KEY }}
+        run: |
+          # Install the newest vastai client, then hand it the account API key so the
+          # later `vastai` calls in this job are authenticated.
+          pip install -q -U vastai
+          vastai set api-key "$VAST_API_KEY"
+
+      - name: Generate ephemeral SSH key
+        id: sshkey
+        run: |
+          # Per-run, passphrase-less ed25519 keypair; its path is exported as `key_path`.
+          key_file="$HOME/.ssh/vast_bench"
+          mkdir -p "$(dirname "$key_file")"
+          ssh-keygen -t ed25519 -N "" -C "gh-actions-bench-${GITHUB_RUN_ID}" -f "$key_file" >/dev/null
+          printf 'key_path=%s\n' "$key_file" >> "$GITHUB_OUTPUT"
+
+      - name: Pick a Vast offer
+        id: offer
+        env:
+          # Retry the same query to ride out transient scarcity (datacenter RTX 5090s
+          # are a small, fast-churning pool). Total wait ~= ATTEMPTS * INTERVAL.
+          OFFER_ATTEMPTS: "10"
+          OFFER_INTERVAL: "30"
+          # Require driver >= this major so cudarc (default cuda-version-from-build-system)
+          # matches the runtime driver. Older drivers (e.g. 575) lack newer symbols like
+          # cuCtxGetDevice_v2 and the GPU path falls back to CPU. Filtered client-side in jq
+          # because vast can't numerically compare the driver_version string server-side.
+          MIN_DRIVER: "580"
+        run: |
+          # Search for the cheapest matching offer, retrying while none is available.
+          # Outputs: id (offer id) and price (USD/hr as reported by Vast).
+          #
+          # cpu_ram in the search filter is GB (the returned .cpu_ram field is MB — different
+          # units), so >=96 means 96 GB. >=96000 would mean 96000 GB and match nothing.
+          # disk_space follows VAST_IMAGE_DISK so the search and the later `--disk` agree.
+          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=${VAST_IMAGE_DISK} verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
+          echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)"
+          # Keep only offers whose driver major >= MIN_DRIVER, then cheapest first. A
+          # missing or unparsable driver_version counts as 0 (rejected) instead of making
+          # jq fail for the whole result set.
+          SELECT="map(select(((.driver_version // \"0\") | tostring | split(\".\")[0] | tonumber? // 0) >= ${MIN_DRIVER})) | sort_by(.dph_total)"
+          OFFER_ID=""; OFFER_PRICE=""
+          for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
+            vastai search offers "$QUERY" --raw -o dph_total > offers.json || true
+            # `|| true`: this script runs under `bash -e`, so a jq failure on an empty or
+            # non-JSON reply (transient API error) would otherwise abort the step instead
+            # of falling through to the next retry.
+            OFFER_ID=$(jq -r "$SELECT | .[0].id // empty" offers.json 2>/dev/null || true)
+            OFFER_PRICE=$(jq -r "$SELECT | .[0].dph_total // empty" offers.json 2>/dev/null || true)
+            if [ -n "$OFFER_ID" ]; then
+              echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr (attempt $attempt)"
+              break
+            fi
+            # No point sleeping after the final attempt.
+            if [ "$attempt" -lt "$OFFER_ATTEMPTS" ]; then
+              echo "No matching offer (attempt $attempt/$OFFER_ATTEMPTS); retrying in ${OFFER_INTERVAL}s..."
+              sleep "$OFFER_INTERVAL"
+            else
+              echo "No matching offer (attempt $attempt/$OFFER_ATTEMPTS)"
+            fi
+          done
+          if [ -z "$OFFER_ID" ]; then
+            echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=${VAST_IMAGE_DISK}GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)"
+            exit 1
+          fi
+          echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"
+          echo "price=$OFFER_PRICE" >> "$GITHUB_OUTPUT"
+
+      - name: Create instance
+        id: instance
+        env:
+          VAST_TEMPLATE_HASH: ${{ secrets.VAST_TEMPLATE_HASH }}
+          OFFER_ID: ${{ steps.offer.outputs.id }}
+        run: |
+          # Rent the selected offer from the template and capture the raw API reply.
+          vastai create instance "$OFFER_ID" --template_hash "$VAST_TEMPLATE_HASH" --disk "$VAST_IMAGE_DISK" --ssh --direct --raw > create.json
+          cat create.json
+          # The new instance id is read from either location in the reply.
+          instance_id=$(jq -r '.new_contract // .instances.new_contract // empty' create.json)
+          if [ -n "$instance_id" ]; then
+            # Write the id to disk first: the teardown step reads this file, so the box
+            # is destroyed even when a later step fails.
+            echo "$instance_id" > "$RUNNER_TEMP/vast_instance_id"
+            echo "id=$instance_id" >> "$GITHUB_OUTPUT"
+            echo "Created instance $instance_id"
+          else
+            echo "::error::Failed to create Vast instance"
+            exit 1
+          fi
+
+      - name: Attach SSH key to instance
+        env:
+          IID: ${{ steps.instance.outputs.id }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+        run: |
+          # The ephemeral public key is attached to this one instance (it lands in the
+          # instance's authorized_keys) and disappears with the instance, so there is no
+          # account-level key to clean up afterwards.
+          pubkey="$(cat "$KEY.pub")"
+          max_tries=12
+          try=1
+          # A freshly created instance may reject the attach at first, so keep trying.
+          while [ "$try" -le "$max_tries" ]; do
+            if vastai attach ssh "$IID" "$pubkey"; then
+              echo "Attached ssh key (attempt $try)"
+              exit 0
+            fi
+            echo "attach failed (attempt $try/$max_tries); retrying in 10s..."
+            sleep 10
+            try=$((try + 1))
+          done
+          echo "::error::Failed to attach ssh key to instance $IID"
+          exit 1
+
+      - name: Wait for SSH
+        id: ssh
+        env:
+          IID: ${{ steps.instance.outputs.id }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+        run: |
+          # Phase 1: poll the instance until it is 'running' and exposes a direct SSH
+          # endpoint. Phase 2: poll sshd until it accepts the ephemeral key.
+          # Outputs: host and port of the direct SSH endpoint.
+          echo "Waiting for instance $IID to reach 'running' with SSH endpoint..."
+          HOST=""; PORT=""; STATUS=""
+          for _ in $(seq 1 60); do   # ~10 min
+            vastai show instance "$IID" --raw > inst.json || true
+            # `|| true` on each jq call: this script runs under `bash -e`, so a non-JSON
+            # reply (transient API error text) would otherwise abort the step instead of
+            # being retried on the next poll.
+            STATUS=$(jq -r '.actual_status // empty' inst.json 2>/dev/null || true)
+            # We create with --direct, so SSH straight to the public IP + the host port
+            # mapped to container port 22. The .ssh_host/.ssh_port proxy fields are
+            # unreliable (observed off-by-one vs the real proxy port), so use the direct
+            # mapping — same endpoint `vastai ssh-url` reports.
+            HOST=$(jq -r '.public_ipaddr // empty' inst.json 2>/dev/null || true)
+            PORT=$(jq -r '.ports["22/tcp"][0].HostPort // empty' inst.json 2>/dev/null || true)
+            echo "  status=$STATUS ssh=$HOST:$PORT"
+            if [ "$STATUS" = "running" ] && [ -n "$HOST" ] && [ -n "$PORT" ]; then
+              break
+            fi
+            sleep 10
+          done
+          if [ "$STATUS" != "running" ] || [ -z "$HOST" ] || [ -z "$PORT" ]; then
+            echo "::error::Instance never became reachable (status=$STATUS host=$HOST port=$PORT)"
+            exit 1
+          fi
+          echo "host=$HOST" >> "$GITHUB_OUTPUT"
+          echo "port=$PORT" >> "$GITHUB_OUTPUT"
+
+          # Wait for sshd to accept our key (key path comes from env, like the other steps).
+          for _ in $(seq 1 30); do
+            if ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \
+                 -i "$KEY" -p "$PORT" "root@$HOST" true 2>/dev/null; then
+              echo "sshd reachable"; exit 0
+            fi
+            sleep 10
+          done
+          echo "::error::sshd did not accept connections in time"
+          exit 1
+
+      - name: Wait for onstart provisioning
+        env:
+          HOST: ${{ steps.ssh.outputs.host }}
+          PORT: ${{ steps.ssh.outputs.port }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+        run: |
+          remote="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
+          echo "Waiting for the template onstart script to finish (Rust + LLVM + sysroot + clone)..."
+          # Primary signal: the bootstrap ends by printing "=== done ===", which Vast
+          # captures in /var/log/onstart.log.
+          log_check='grep -q "=== done ===" /var/log/onstart.log 2>/dev/null'
+          # Fallback signal: the artifacts the bootstrap leaves behind are all in place.
+          # shellcheck disable=SC2016  # $HOME must expand on the remote box, not the runner
+          artifact_check='test -x "$HOME/.cargo/bin/cargo" \
+                  && test -f /opt/lambda-vm-sysroot/include/stdlib.h \
+                  && test -d /workspace/lambda_vm/.git \
+                  && "$HOME/.cargo/bin/rustup" toolchain list 2>/dev/null | grep -q nightly-2026-02-01'
+          polls=0
+          until [ "$polls" -ge 120 ]; do   # ~20 min
+            if $remote "$log_check"; then
+              echo "onstart reported done"
+              exit 0
+            fi
+            if $remote "$artifact_check"; then
+              echo "provisioning artifacts present"
+              exit 0
+            fi
+            sleep 10
+            polls=$((polls + 1))
+          done
+          echo "::error::onstart provisioning did not complete in time"
+          exit 1
+
+      - name: Run GPU ABBA benchmark
+        id: bench
+        env:
+          HOST: ${{ steps.ssh.outputs.host }}
+          PORT: ${{ steps.ssh.outputs.port }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+          PR_NUM: ${{ steps.config.outputs.pr_num }}
+          HEAD_SHA: ${{ steps.config.outputs.head_sha }}
+          BRANCH: ${{ steps.config.outputs.branch }}
+          PAIRS: ${{ steps.config.outputs.pairs }}
+        run: |
+          # Runs scripts/bench_abba.sh on the rented box over SSH, saves the full log to
+          # $RUNNER_TEMP/abba_out.txt and the result section to abba_result.txt.
+          #
+          # The default `run` shell is `bash -e` WITHOUT pipefail, so `ssh ... | tee` would
+          # return tee's status and a failed benchmark would be reported as a success.
+          set -o pipefail
+
+          # These values are spliced into a remote shell command line below; refuse
+          # anything that is not a plain number / ref-safe string.
+          case "$PAIRS" in
+            ''|*[!0-9]*) echo "::error::invalid pair count '$PAIRS'"; exit 1 ;;
+          esac
+          case "${PR_NUM}${HEAD_SHA}${BRANCH}" in
+            *[!A-Za-z0-9._/@+-]*) echo "::error::unsafe characters in PR number / head SHA / branch"; exit 1 ;;
+          esac
+
+          # Keepalives: bench_abba.sh redirects cargo output to a log file, so the session
+          # prints nothing during the (~15-30 min) build; an idle connection can be dropped
+          # by NAT/firewall idle timeouts between the runner and the box.
+          SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -o ServerAliveInterval=30 -o ServerAliveCountMax=10 -i $KEY -p $PORT root@$HOST"
+
+          # Resolve the PR side (REF_A) and the fetch needed to make it resolvable on the box.
+          if [ -n "$PR_NUM" ]; then
+            FETCH="git fetch --force origin refs/pull/$PR_NUM/head"
+            REF_A="$HEAD_SHA"
+          else
+            FETCH="git fetch --force origin $BRANCH"
+            REF_A="origin/$BRANCH"
+          fi
+
+          # The template clones the repo at the DEFAULT branch (main), so check out the PR
+          # ref first — otherwise we'd run main's bench_abba.sh (no BENCH_FEATURES => CPU
+          # build). bench_abba.sh then builds the cli at REF_A and origin/main (isolated
+          # worktree), runs PAIRS interleaved A/B/B/A proves, and prints the paired-t CI +
+          # Wilcoxon verdict. BENCH_FEATURES routes the build through the CUDA prover path.
+          # REBUILD=1: each Vast box is fresh, GPU-specific hardware — always rebuild both
+          # binaries (PTX is compiled for the detected arch); never trust a cached binary.
+          REMOTE="set -e; cd /workspace/lambda_vm; \
+            command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \
+            git fetch --force origin main; $FETCH; \
+            git checkout -f $REF_A; \
+            REBUILD=1 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \
+            scripts/bench_abba.sh $REF_A origin/main $PAIRS"
+
+          $SSH "bash -lc \"$REMOTE\"" | tee "$RUNNER_TEMP/abba_out.txt"
+          # Extract the result section for the PR comment (same marker bench-abba.yml uses).
+          sed -n '/=== ABBA paired result/,$p' "$RUNNER_TEMP/abba_out.txt" > "$RUNNER_TEMP/abba_result.txt"
+
+          # Surface the result in the Actions run summary too (push/workflow_dispatch
+          # runs have no PR to comment on).
+          {
+            echo "## GPU ABBA — ethrex 20 transfers (vs main)"
+            echo '```'
+            cat "$RUNNER_TEMP/abba_result.txt"
+            echo '```'
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Comment ABBA result on PR
+        if: always() && github.event_name == 'issue_comment'
+        uses: actions/github-script@v7
+        env:
+          HEAD_SHA: ${{ steps.config.outputs.head_sha }}
+          PAIRS: ${{ steps.config.outputs.pairs }}
+          OUTCOME: ${{ steps.bench.outcome }}
+          GPU_NAME: ${{ env.GPU_NAME }}
+          OFFER_PRICE: ${{ steps.offer.outputs.price }}
+        with:
+          script: |
+            // Post the benchmark verdict (or the tail of the log on failure) to the PR,
+            // updating the previous bot result comment when one exists.
+            const fs = require('fs');
+            const { owner, repo } = context.repo;
+            const issue_number = context.issue.number;
+            const tmp = process.env.RUNNER_TEMP;
+            // Stay well below GitHub's comment size limit; keep the END of long logs,
+            // which is where the verdict / error is.
+            const MAX_LOG_CHARS = 60000;
+            const read = (p) => { try { return fs.readFileSync(p, 'utf8').trim(); } catch { return ''; } };
+            const clip = (s) => (s.length > MAX_LOG_CHARS ? '[... truncated ...]\n' + s.slice(-MAX_LOG_CHARS) : s);
+            const head = (process.env.HEAD_SHA || '').slice(0, 10);
+            const pairs = process.env.PAIRS;
+            // /_/g: replace every underscore, not just the first one.
+            const gpu = (process.env.GPU_NAME || '').replace(/_/g, ' ');
+            const price = process.env.OFFER_PRICE;
+            const runUrl = `${context.serverUrl}/${owner}/${repo}/actions/runs/${context.runId}`;
+
+            let body = `## GPU Benchmark (ABBA) — \`${head}\` vs \`main\` (${pairs} pairs)\n\n`;
+            body += `<sub>${gpu} · Vast.ai datacenter${price ? ` @ \$${price}/hr` : ''} · \`prover/cuda\` · drift-free A/B/B/A</sub>\n\n`;
+            if (process.env.OUTCOME === 'success') {
+              const res = read(`${tmp}/abba_result.txt`) || read(`${tmp}/abba_out.txt`);
+              body += '```\n' + clip(res) + '\n```\n';
+              body += '\n<sub>+ = PR faster. Trust the verdict when paired-t and Wilcoxon agree.</sub>\n';
+            } else {
+              const tail = read(`${tmp}/abba_out.txt`).split('\n').slice(-30).join('\n');
+              if (tail) {
+                body += `❌ Run failed. Last log lines:\n\n` + '```\n' + clip(tail) + '\n```\n';
+              } else {
+                // Nothing captured: the job failed (or was cancelled) before the benchmark ran.
+                body += `❌ Run failed before the benchmark produced any output.\n`;
+              }
+              body += `\n[Workflow run logs](${runUrl})\n`;
+            }
+
+            // Paginate: listComments returns a single page (30 comments by default), so
+            // on a busy PR the earlier result comment would be missed and duplicated.
+            const comments = await github.paginate(github.rest.issues.listComments, {
+              owner, repo, issue_number, per_page: 100,
+            });
+            const marker = 'GPU Benchmark (ABBA)';
+            const existing = comments.find(c => c.user && c.user.type === 'Bot' && (c.body || '').includes(marker));
+            if (existing) {
+              await github.rest.issues.updateComment({
+                owner, repo, comment_id: existing.id, body,
+              });
+            } else {
+              await github.rest.issues.createComment({
+                owner, repo, issue_number, body,
+              });
+            }
+
+      # --- Teardown: ALWAYS destroy the instance (cost guardrail) ---
+      # Runs on success, failure and cancellation. The instance id is read from the file
+      # written by "Create instance". The destroy call is retried a few times because a
+      # single transient API error would otherwise leave a paid box running.
+      - name: Destroy instance
+        if: always()
+        run: |
+          if [ ! -f "$RUNNER_TEMP/vast_instance_id" ]; then
+            echo "No instance id recorded; nothing to destroy."
+            exit 0
+          fi
+          IID=$(cat "$RUNNER_TEMP/vast_instance_id")
+          echo "Destroying instance $IID"
+          DESTROY_ATTEMPTS=5
+          for attempt in $(seq 1 "$DESTROY_ATTEMPTS"); do
+            # --yes: skip the interactive [y/N] confirm (CI has no tty).
+            if vastai destroy instance "$IID" --yes; then
+              echo "Destroyed instance $IID (attempt $attempt)"
+              exit 0
+            fi
+            if [ "$attempt" -lt "$DESTROY_ATTEMPTS" ]; then
+              echo "destroy failed (attempt $attempt/$DESTROY_ATTEMPTS); retrying in 15s..."
+              sleep 15
+            fi
+          done
+          # Still best-effort: warn instead of failing the job, as before.
+          echo "::warning::destroy instance $IID failed — check the Vast console"
diff --git a/.github/workflows/benchmark-pr.yml b/.github/workflows/benchmark-pr.yml
index 57169967d..0ef6ecfd2 100644
--- a/.github/workflows/benchmark-pr.yml
+++ b/.github/workflows/benchmark-pr.yml
@@ -60,6 +60,7 @@ jobs:
        github.event.issue.pull_request &&
        startsWith(github.event.comment.body, '/bench') &&
        !startsWith(github.event.comment.body, '/bench-abba') &&
+       !startsWith(github.event.comment.body, '/bench-gpu') &&
        contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
     steps:
       - name: React to comment
diff --git a/scripts/bench_abba.sh b/scripts/bench_abba.sh
index 79bfddf27..57fab5e28 100755
--- a/scripts/bench_abba.sh
+++ b/scripts/bench_abba.sh
@@ -27,6 +27,8 @@
 #     REF_B    baseline   (default: origin/main)
 #     N_PAIRS  pairs      (default: 20 -> 40 runs, ~33 min on ethrex)
 #   Env: REBUILD=1 forces a rebuild even if cached binaries exist.
+#        BENCH_FEATURES=<list> cargo features for the cli build (default: jemalloc-stats).
+#          The GPU ABBA workflow passes "jemalloc-stats,prover/cuda" to bench the GPU path.
 #
 #   Sizing (ethrex pair-noise sd ~1.2%, 80% power): ~12 pairs for a 1% effect,
 #   ~18 for 0.8%, ~32 for 0.6%. Default 20 -> solid on 0.8-1%, ~60% power at 0.6%
@@ -45,6 +47,9 @@ fi
 REF_A="$1"
 REF_B="${2:-origin/main}"
 N_PAIRS="${3:-20}"
+# cli build features. Default matches the CPU bench; the GPU ABBA workflow overrides
+# with "jemalloc-stats,prover/cuda" to exercise the CUDA prover path.
+BENCH_FEATURES="${BENCH_FEATURES:-jemalloc-stats}"
 
 ELF_REL="executor/program_artifacts/rust/ethrex.elf"
 INPUT_REL="executor/tests/ethrex_bench_20.bin"
@@ -102,9 +107,9 @@ if [ "$need_build" = "1" ]; then
   echo "==> Building both prover binaries in isolated worktree $WT"
   git worktree add --detach "$WT" "$SHA_B" >/dev/null
   build_cli() {  # $1=sha $2=out (shared target dir -> 2nd build is incremental)
-    echo "==> Building cli @ ${1:0:10} -> $2"
+    echo "==> Building cli @ ${1:0:10} -> $2  (features: $BENCH_FEATURES)"
     git -C "$WT" checkout --quiet "$1"
-    if ! ( cd "$WT" && cargo build --release -p cli --features jemalloc-stats >"$WORK/build_$2.log" 2>&1 ); then
+    if ! ( cd "$WT" && cargo build --release -p cli --features "$BENCH_FEATURES" >"$WORK/build_$2.log" 2>&1 ); then
       echo "ERROR: cargo build failed for $2 (@ ${1:0:10}). Tail of $WORK/build_$2.log:" >&2
       tail -40 "$WORK/build_$2.log" >&2
       exit 1