From 4ba86e9f4fe5ac17efce80a5f1bc952a825c2a85 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:35:20 +0000 Subject: [PATCH 01/16] docs: add golden-vector parity spec and implementation plan (PLT-1735) Co-Authored-By: Claude Sonnet 4.6 --- .../plans/2026-06-18-golden-vector-parity.md | 1147 +++++++++++++++++ .../2026-06-18-golden-vector-parity-design.md | 389 ++++++ 2 files changed, 1536 insertions(+) create mode 100644 docs/metamorphic/plans/2026-06-18-golden-vector-parity.md create mode 100644 docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md diff --git a/docs/metamorphic/plans/2026-06-18-golden-vector-parity.md b/docs/metamorphic/plans/2026-06-18-golden-vector-parity.md new file mode 100644 index 0000000..42d1010 --- /dev/null +++ b/docs/metamorphic/plans/2026-06-18-golden-vector-parity.md @@ -0,0 +1,1147 @@ +# Golden Vector Parity — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use sensei:subagent-driven-development (recommended) or sensei:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Establish a shared JSON golden-vector fixture (`tests/golden/include_metadata_v0.3.json`) generated by Rust and consumed by Python, with CI drift-detection and automated release workflows for both repos. + +**Architecture:** Phase 1 implements the Rust side: an `emit_golden_metadata` binary writes the fixture, and `tests/golden_vectors.rs` validates it on every `cargo test`. Phase 2 implements the Python side: `tests/test_golden_parity_metadata.py` parametrizes over the same JSON fixture, and a `golden-sync-check` CI job fetches the fixture from the Rust repo on every PR to detect drift. Both repos gain a `workflow_dispatch` release workflow backed by `cargo-release` (Rust) and a git-tag push (Python). 
+ +**Tech Stack:** Rust 1.91.1, `arrow-ipc 57.0.0`, `base64 0.22`, `serde_json`, `cargo-release`; Python 3.10+, `pyarrow ≥ 14`, `pytest`, `uv`; GitHub Actions `actions/create-github-app-token@v3`. + +**Spec:** `docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md` + +--- + +## File Map + +### `nauticalab/starfix` (Rust — authoritative) + +| File | Action | Responsibility | +|---|---|---| +| `Cargo.toml` | Modify | Add `arrow-ipc`, `base64` deps; enable serde derive; bump version to `0.3.0`; register new bin | +| `release.toml` | Create | `cargo-release` config: commit message, tag format, no crates.io publish | +| `src/bin/emit_golden_metadata.rs` | Create | CLI tool that builds all 9 vectors and writes `include_metadata_v0.3.json` to stdout | +| `tests/golden/include_metadata_v0.3.json` | Create (generated) | Committed fixture generated by the binary above | +| `tests/golden_vectors.rs` | Create | Regression guard: reads committed fixture, re-runs hashes, asserts parity | +| `.github/workflows/maturin-release.yml` | Modify | Add `verify-version-tag-sync` job | +| `.github/workflows/release.yml` | Create | Manual `workflow_dispatch` release using `cargo-release` + GitHub App token | + +### `nauticalab/starfix-python` (Python — consumer) + +| File | Action | Responsibility | +|---|---|---| +| `tests/golden/include_metadata_v0.3.json` | Create (copied) | Exact copy of the Rust-generated fixture | +| `tests/test_golden_parity_metadata.py` | Create | Parametrized parity tests; one pytest case per vector | +| `.github/workflows/ci.yml` | Modify | Add `golden-sync-check` job | +| `.github/workflows/release.yml` | Create | Manual `workflow_dispatch` release via git tag + GitHub App token | + +--- + +## Phase 1: Rust (`nauticalab/starfix`) + +Working directory: `/home/kurouto/kurouto-jobs/c2e6a292-30cd-4f99-9c0b-5fb08c4d7ddf/starfix` +Branch: `eywalker/plt-1735-starfix-starfix-python-cross-language-hash-parity-via-golden` (already created) + +--- + +### Task 1: 
Update `Cargo.toml` + +**Files:** +- Modify: `Cargo.toml` + +- [ ] **Step 1: Update `Cargo.toml`** + +Apply all four changes at once: version bump, serde derive feature, new deps, new bin registration. + +```toml +[package] +name = "starfix" +version = "0.3.0" # ← bumped from 0.1.0 +# … rest unchanged … + +[dependencies] +# … existing deps unchanged … +serde = { version = "1.0.228", features = ["derive"] } # ← add features = ["derive"] +arrow-ipc = { version = "57.0.0" } # ← new +base64 = "0.22" # ← new + +# … existing [[bin]] for uniffi-bindgen unchanged … + +[[bin]] +name = "emit_golden_metadata" # ← new +``` + +- [ ] **Step 2: Verify it compiles** + +```bash +cargo build 2>&1 | head -20 +``` + +Expected: no errors (warnings about unused imports are fine at this stage). + +- [ ] **Step 3: Commit** + +```bash +git add Cargo.toml Cargo.lock +git commit -m "chore: bump to v0.3.0; add arrow-ipc, base64, serde derive" +``` + +--- + +### Task 2: Create `src/bin/emit_golden_metadata.rs` + +**Files:** +- Create: `src/bin/emit_golden_metadata.rs` + +- [ ] **Step 1: Write the binary** + +```rust +//! Golden metadata fixture generator for PLT-1735. +//! +//! # Usage +//! +//! ```bash +//! cargo run --bin emit_golden_metadata > tests/golden/include_metadata_v0.3.json +//! ``` +//! +//! Then copy the output file to `starfix-python/tests/golden/include_metadata_v0.3.json`. +//! +//! # When to regenerate +//! +//! Only regenerate when the hash algorithm changes intentionally. The committed fixture +//! is the authoritative source — `cargo test` reads it and will fail if the hasher +//! output no longer matches. 
+ +#![expect(clippy::unwrap_used, reason = "CLI tool — panics are acceptable")] + +use std::collections::HashMap; +use std::io::Cursor; +use std::sync::Arc; + +use arrow::array::RecordBatch; +use arrow_ipc::writer::StreamWriter; +use arrow_schema::{DataType, Field, Schema}; +use base64::Engine as _; +use serde::{Deserialize, Serialize}; +use starfix::{ArrowDigester, HasherConfig}; + +#[derive(Serialize, Deserialize)] +struct GoldenFixture { + version: String, + generated_by: String, + rust_commit: String, + vectors: Vec<GoldenVector>, +} + +#[derive(Serialize, Deserialize)] +struct GoldenVector { + id: String, + description: String, + method: String, + include_metadata: bool, + ipc_b64: String, + expected_hash: String, +} + +fn git_sha() -> String { + std::process::Command::new("git") + .args(["rev-parse", "HEAD"]) + .output() + .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string()) + .unwrap_or_else(|_| "unknown".to_string()) +} + +fn schema_to_ipc_b64(schema: &Schema) -> String { + let mut buf: Vec<u8> = Vec::new(); + let mut writer = StreamWriter::try_new(&mut buf, schema).unwrap(); + writer.finish().unwrap(); + base64::engine::general_purpose::STANDARD.encode(&buf) +} + +fn hash_schema_hex(schema: &Schema, include_metadata: bool) -> String { + let config = HasherConfig { include_metadata }; + hex::encode(ArrowDigester::hash_schema(schema, config)) +} + +fn meta(pairs: &[(&str, &str)]) -> HashMap<String, String> { + pairs + .iter() + .map(|&(k, v)| (k.to_owned(), v.to_owned())) + .collect() +} + +fn main() { + let mut vectors: Vec<GoldenVector> = Vec::new(); + + // ── 1. 
no_metadata_include_false ──────────────────────────────────────── + { + let schema = Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::LargeUtf8, true), + ]); + vectors.push(GoldenVector { + id: "no_metadata_include_false".to_owned(), + description: "Schema {id: Int64, name: LargeUtf8}, no metadata, include_metadata=false" + .to_owned(), + method: "hash_schema".to_owned(), + include_metadata: false, + ipc_b64: schema_to_ipc_b64(&schema), + expected_hash: hash_schema_hex(&schema, false), + }); + } + + // ── 2. schema_level_metadata ───────────────────────────────────────────── + { + let schema = Schema::new_with_metadata( + vec![Field::new("id", DataType::Int64, false)], + meta(&[("version", "2")]), + ); + vectors.push(GoldenVector { + id: "schema_level_metadata".to_owned(), + description: "Schema-level metadata {version: 2}, include_metadata=true".to_owned(), + method: "hash_schema".to_owned(), + include_metadata: true, + ipc_b64: schema_to_ipc_b64(&schema), + expected_hash: hash_schema_hex(&schema, true), + }); + } + + // ── 3. field_metadata_single_field ─────────────────────────────────────── + { + let schema = Schema::new(vec![ + Field::new("x", DataType::Int32, false).with_metadata(meta(&[("unit", "kg")])), + ]); + vectors.push(GoldenVector { + id: "field_metadata_single_field".to_owned(), + description: "Single field x: Int32 with metadata {unit: kg}, include_metadata=true" + .to_owned(), + method: "hash_schema".to_owned(), + include_metadata: true, + ipc_b64: schema_to_ipc_b64(&schema), + expected_hash: hash_schema_hex(&schema, true), + }); + } + + // ── 4. 
field_metadata_multiple_fields ──────────────────────────────────── + { + let schema = Schema::new(vec![ + Field::new("x", DataType::Int32, false).with_metadata(meta(&[("unit", "kg")])), + Field::new("y", DataType::Float64, false).with_metadata(meta(&[("unit", "m")])), + ]); + vectors.push(GoldenVector { + id: "field_metadata_multiple_fields".to_owned(), + description: "Two fields x:{unit:kg}, y:{unit:m}, include_metadata=true".to_owned(), + method: "hash_schema".to_owned(), + include_metadata: true, + ipc_b64: schema_to_ipc_b64(&schema), + expected_hash: hash_schema_hex(&schema, true), + }); + } + + // ── 5. schema_and_field_metadata ───────────────────────────────────────── + { + let schema = Schema::new_with_metadata( + vec![ + Field::new("x", DataType::Int32, false).with_metadata(meta(&[("unit", "kg")])), + ], + meta(&[("version", "1")]), + ); + vectors.push(GoldenVector { + id: "schema_and_field_metadata".to_owned(), + description: + "Schema metadata {version:1} + field metadata {unit:kg}, include_metadata=true" + .to_owned(), + method: "hash_schema".to_owned(), + include_metadata: true, + ipc_b64: schema_to_ipc_b64(&schema), + expected_hash: hash_schema_hex(&schema, true), + }); + } + + // ── 6. unicode_metadata ─────────────────────────────────────────────────── + { + let schema = Schema::new(vec![ + Field::new("data", DataType::LargeUtf8, false).with_metadata(meta(&[ + ("emoji_key_\u{1F511}", "value_\u{2713}"), + ("\u{4E2D}\u{6587}", "\u{65E5}\u{672C}\u{8A9E}"), + ])), + ]); + vectors.push(GoldenVector { + id: "unicode_metadata".to_owned(), + description: "Field metadata with emoji and CJK keys/values, include_metadata=true" + .to_owned(), + method: "hash_schema".to_owned(), + include_metadata: true, + ipc_b64: schema_to_ipc_b64(&schema), + expected_hash: hash_schema_hex(&schema, true), + }); + } + + // ── 7. 
key_reorder_canonical ────────────────────────────────────────────── + { + let schema = Schema::new(vec![ + Field::new("x", DataType::Int32, false) + .with_metadata(meta(&[("alpha", "1"), ("beta", "2"), ("gamma", "3")])), + ]); + let expected = hash_schema_hex(&schema, true); + vectors.push(GoldenVector { + id: "key_reorder_canonical".to_owned(), + description: "Field metadata {alpha,beta,gamma} inserted in alphabetical order, include_metadata=true".to_owned(), + method: "hash_schema".to_owned(), + include_metadata: true, + ipc_b64: schema_to_ipc_b64(&schema), + expected_hash: expected, + }); + } + + // ── 8. key_reorder_shuffled ─────────────────────────────────────────────── + // Same logical metadata as canonical; HashMap iteration may produce same IPC bytes + // within one process run, but the hash is always identical — that is the invariant. + { + let schema = Schema::new(vec![ + Field::new("x", DataType::Int32, false) + .with_metadata(meta(&[("gamma", "3"), ("alpha", "1"), ("beta", "2")])), + ]); + vectors.push(GoldenVector { + id: "key_reorder_shuffled".to_owned(), + description: "Same metadata {alpha,beta,gamma} inserted in shuffled order — expected_hash must equal key_reorder_canonical".to_owned(), + method: "hash_schema".to_owned(), + include_metadata: true, + ipc_b64: schema_to_ipc_b64(&schema), + expected_hash: hash_schema_hex(&schema, true), + }); + } + + // ── 9. empty_metadata_invariant ─────────────────────────────────────────── + // No metadata at all. Hash with include_metadata=false and include_metadata=true + // must be identical. We pin include_metadata=false as the fixture entry; + // both Rust and Python tests additionally assert the true variant equals this. 
+ { + let schema = Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::LargeUtf8, true), + ]); + vectors.push(GoldenVector { + id: "empty_metadata_invariant".to_owned(), + description: "Schema with no metadata — hash(include_metadata=false) must equal hash(include_metadata=true)".to_owned(), + method: "hash_schema".to_owned(), + include_metadata: false, + ipc_b64: schema_to_ipc_b64(&schema), + expected_hash: hash_schema_hex(&schema, false), + }); + } + + let fixture = GoldenFixture { + version: "0.3".to_owned(), + generated_by: "cargo run --bin emit_golden_metadata".to_owned(), + rust_commit: git_sha(), + vectors, + }; + + println!("{}", serde_json::to_string_pretty(&fixture).unwrap()); +} +``` + +- [ ] **Step 2: Verify the binary compiles** + +```bash +cargo build --bin emit_golden_metadata 2>&1 +``` + +Expected: compiles with no errors. Clippy warnings about unused imports are resolved in the next step. + +- [ ] **Step 3: Run clippy to catch any issues** + +```bash +cargo clippy --bin emit_golden_metadata -- -D warnings 2>&1 +``` + +Fix any issues before proceeding. + +- [ ] **Step 4: Commit** + +```bash +git add src/bin/emit_golden_metadata.rs +git commit -m "feat: add emit_golden_metadata binary (PLT-1735)" +``` + +--- + +### Task 3: Generate the fixture file + +**Files:** +- Create: `tests/golden/include_metadata_v0.3.json` + +- [ ] **Step 1: Create the golden directory and generate the fixture** + +```bash +mkdir -p tests/golden +cargo run --bin emit_golden_metadata > tests/golden/include_metadata_v0.3.json +``` + +- [ ] **Step 2: Inspect the output** + +```bash +cat tests/golden/include_metadata_v0.3.json | python3 -m json.tool | head -60 +``` + +Expected: valid JSON with `version`, `generated_by`, `rust_commit`, and `vectors` array of 9 entries. Verify each entry has all 6 fields (`id`, `description`, `method`, `include_metadata`, `ipc_b64`, `expected_hash`). 
+ +- [ ] **Step 3: Verify `key_reorder_canonical` and `key_reorder_shuffled` have the same `expected_hash`** + +```bash +python3 -c " +import json +f = json.load(open('tests/golden/include_metadata_v0.3.json')) +vecs = {v['id']: v for v in f['vectors']} +canon = vecs['key_reorder_canonical']['expected_hash'] +shuffled = vecs['key_reorder_shuffled']['expected_hash'] +print('canonical:', canon) +print('shuffled: ', shuffled) +assert canon == shuffled, 'MISMATCH — hashes must be identical' +print('OK: hashes match') +" +``` + +Expected: both hashes match. + +- [ ] **Step 4: Verify `empty_metadata_invariant` hash matches `include_metadata=true` result** + +```bash +python3 -c " +import json, base64, struct +f = json.load(open('tests/golden/include_metadata_v0.3.json')) +vecs = {v['id']: v for v in f['vectors']} +inv = vecs['empty_metadata_invariant'] +print('empty_metadata_invariant expected_hash:', inv['expected_hash']) +print('(manually verify this matches hash_schema with include_metadata=true in cargo test)') +" +``` + +- [ ] **Step 5: Commit** + +```bash +git add tests/golden/include_metadata_v0.3.json +git commit -m "feat: add golden fixture tests/golden/include_metadata_v0.3.json (PLT-1735)" +``` + +--- + +### Task 4: Write `tests/golden_vectors.rs` + +**Files:** +- Create: `tests/golden_vectors.rs` + +- [ ] **Step 1: Write the regression test** + +```rust +//! Regression guard for the golden metadata fixture. +//! +//! Reads `tests/golden/include_metadata_v0.3.json`, re-runs `ArrowDigester` on each +//! vector's IPC blob, and asserts that the output matches the committed `expected_hash`. +//! If this test fails, either the fixture is stale (regenerate with +//! `cargo run --bin emit_golden_metadata`) or the hasher has changed unexpectedly. 
+ +#[cfg(test)] +mod tests { + #![expect(clippy::unwrap_used, reason = "Okay in test")] + + use std::io::Cursor; + use std::sync::Arc; + + use arrow_ipc::reader::StreamReader; + use arrow_schema::Schema; + use base64::Engine as _; + use hex::encode; + use serde::Deserialize; + use starfix::{ArrowDigester, HasherConfig}; + + #[derive(Deserialize)] + struct GoldenFixture { + vectors: Vec<GoldenVector>, + } + + #[derive(Deserialize)] + struct GoldenVector { + id: String, + description: String, + method: String, + include_metadata: bool, + ipc_b64: String, + expected_hash: String, + } + + fn load_fixture() -> GoldenFixture { + let path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/tests/golden/include_metadata_v0.3.json" + ); + let raw = std::fs::read_to_string(path) + .unwrap_or_else(|_| panic!("fixture not found at {path} — run `cargo run --bin emit_golden_metadata > tests/golden/include_metadata_v0.3.json`")); + serde_json::from_str(&raw).unwrap() + } + + fn decode_ipc(ipc_b64: &str) -> (Arc<Schema>, Option<arrow::array::RecordBatch>) { + let bytes = base64::engine::general_purpose::STANDARD + .decode(ipc_b64) + .unwrap(); + let cursor = Cursor::new(bytes); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let schema = reader.schema(); + let batch = reader.next().and_then(|r| r.ok()); + (schema, batch) + } + + #[test] + fn golden_vectors_match() { + let fixture = load_fixture(); + assert!( + !fixture.vectors.is_empty(), + "fixture must contain at least one vector" + ); + + for vector in &fixture.vectors { + let (schema, batch) = decode_ipc(&vector.ipc_b64); + let config = HasherConfig { + include_metadata: vector.include_metadata, + }; + + let result = match vector.method.as_str() { + "hash_schema" => encode(ArrowDigester::hash_schema(&schema, config)), + "hash_record_batch" => { + let batch = batch.unwrap_or_else(|| { + panic!( + "vector '{}': method is hash_record_batch but IPC has no batch", + vector.id + ) + }); + encode(ArrowDigester::hash_record_batch(&batch, config)) + } + other => 
panic!("vector '{}': unknown method '{other}'", vector.id), + }; + + assert_eq!( + result, vector.expected_hash, + "vector '{}' mismatch: {}\n got: {result}\n expected: {}", + vector.id, vector.description, vector.expected_hash + ); + } + } + + #[test] + fn empty_metadata_invariant_both_flags() { + let fixture = load_fixture(); + let vector = fixture + .vectors + .iter() + .find(|v| v.id == "empty_metadata_invariant") + .expect("empty_metadata_invariant vector must exist in fixture"); + + let (schema, _) = decode_ipc(&vector.ipc_b64); + + let hash_false = encode(ArrowDigester::hash_schema( + &schema, + HasherConfig { include_metadata: false }, + )); + let hash_true = encode(ArrowDigester::hash_schema( + &schema, + HasherConfig { include_metadata: true }, + )); + + assert_eq!( + hash_false, vector.expected_hash, + "empty_metadata_invariant: include_metadata=false must match pinned hash" + ); + assert_eq!( + hash_true, vector.expected_hash, + "empty_metadata_invariant: include_metadata=true must match pinned hash" + ); + assert_eq!( + hash_false, hash_true, + "empty_metadata_invariant: both flag values must produce the same hash" + ); + } + + #[test] + fn key_reorder_hashes_are_identical() { + let fixture = load_fixture(); + let vecs: std::collections::HashMap<&str, &GoldenVector> = + fixture.vectors.iter().map(|v| (v.id.as_str(), v)).collect(); + + let canonical = vecs + .get("key_reorder_canonical") + .expect("key_reorder_canonical must exist"); + let shuffled = vecs + .get("key_reorder_shuffled") + .expect("key_reorder_shuffled must exist"); + + assert_eq!( + canonical.expected_hash, shuffled.expected_hash, + "key_reorder_canonical and key_reorder_shuffled must have identical expected_hash" + ); + } +} +``` + +- [ ] **Step 2: Run tests to verify they pass** + +```bash +cargo test --test golden_vectors 2>&1 +``` + +Expected: all 3 tests pass (`golden_vectors_match`, `empty_metadata_invariant_both_flags`, `key_reorder_hashes_are_identical`). 
+ +- [ ] **Step 3: Run full test suite to check nothing is broken** + +```bash +cargo test 2>&1 | tail -20 +``` + +Expected: all tests pass. + +- [ ] **Step 4: Run clippy** + +```bash +cargo clippy --all-targets -- -D warnings 2>&1 +``` + +Expected: no warnings or errors. + +- [ ] **Step 5: Run formatter** + +```bash +cargo fmt --check 2>&1 +``` + +If it fails, run `cargo fmt` then recheck. + +- [ ] **Step 6: Commit** + +```bash +git add tests/golden_vectors.rs +git commit -m "test: add golden_vectors regression test (PLT-1735)" +``` + +--- + +### Task 5: Add `release.toml` + +**Files:** +- Create: `release.toml` + +- [ ] **Step 1: Write `release.toml`** + +```toml +# cargo-release configuration for nauticalab/starfix. +# +# Usage: +# cargo release <version> --execute --no-confirm +# +# This atomically bumps Cargo.toml, commits, creates a v<version> git tag, +# and pushes both. The tag push triggers maturin-release.yml which builds +# and publishes Python wheels to PyPI. +pre-release-commit-message = "chore: release v{{version}}" +tag-name = "v{{version}}" +push = true +publish = false +``` + +- [ ] **Step 2: Commit** + +```bash +git add release.toml +git commit -m "chore: add cargo-release config (PLT-1735)" +``` + +--- + +### Task 6: Add `verify-version-tag-sync` to `maturin-release.yml` + +**Files:** +- Modify: `.github/workflows/maturin-release.yml` + +- [ ] **Step 1: Add the enforcement job** + +Add the following job to `.github/workflows/maturin-release.yml` (insert after the existing `test` job, before `linux`): + +```yaml + verify-version-tag-sync: + runs-on: ubuntu-latest + if: startsWith(github.ref, 'refs/tags/') + steps: + - uses: actions/checkout@v4 + - name: Verify Cargo.toml version matches tag + run: | + TAG="${GITHUB_REF#refs/tags/v}" + CARGO_VERSION=$(grep '^version' Cargo.toml | head -1 | sed 's/.*= *"\(.*\)"/\1/') + if [ "$TAG" != "$CARGO_VERSION" ]; then + echo "ERROR: Tag v${TAG} does not match Cargo.toml version ${CARGO_VERSION}" + exit 1 + fi + echo "OK: Tag 
v${TAG} matches Cargo.toml version ${CARGO_VERSION}" +``` + +Also update the `linux`, `macos`, and `sdist` jobs to add `verify-version-tag-sync` to their `needs`: + +```yaml + linux: + needs: [test, verify-version-tag-sync] + # … rest unchanged … + + macos: + needs: [test, verify-version-tag-sync] + # … rest unchanged … + + sdist: + needs: [test, verify-version-tag-sync] + # … rest unchanged … +``` + +- [ ] **Step 2: Commit** + +```bash +git add .github/workflows/maturin-release.yml +git commit -m "ci: add verify-version-tag-sync job to maturin-release.yml (PLT-1735)" +``` + +--- + +### Task 7: Add `release.yml` workflow (Rust) + +**Files:** +- Create: `.github/workflows/release.yml` + +- [ ] **Step 1: Write the workflow** + +```yaml +name: release + +on: + workflow_dispatch: + inputs: + version: + description: 'Release version (e.g. 0.3.0)' + required: true + type: string + +jobs: + release: + runs-on: ubuntu-latest + steps: + - name: Generate GitHub App token + id: app-token + uses: actions/create-github-app-token@v3 + with: + app-id: ${{ secrets.RELEASE_APP_ID }} + private-key: ${{ secrets.RELEASE_APP_PRIVATE_KEY }} + + - uses: actions/checkout@v4 + with: + token: ${{ steps.app-token.outputs.token }} + fetch-depth: 0 + + - name: Install Rust + uses: actions-rust-lang/setup-rust-toolchain@v1 + with: + toolchain: 1.91.1 + + - name: Install cargo-release + run: cargo install cargo-release + + - name: Configure git + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Release + run: cargo release ${{ inputs.version }} --execute --no-confirm +``` + +- [ ] **Step 2: Commit** + +```bash +git add .github/workflows/release.yml +git commit -m "ci: add manual release workflow using cargo-release (PLT-1735)" +``` + +--- + +### Task 8: Push Rust branch and open PR + +- [ ] **Step 1: Push the branch** + +```bash +git push -u origin 
eywalker/plt-1735-starfix-starfix-python-cross-language-hash-parity-via-golden +``` + +- [ ] **Step 2: Open the PR** + +```bash +gh pr create \ + --title "feat(PLT-1735): golden vector parity — Rust side" \ + --body "$(cat <<'EOF' +## Summary + +- Adds `emit_golden_metadata` binary that generates `tests/golden/include_metadata_v0.3.json` (9 vectors covering all `include_metadata` scenarios) +- Adds `tests/golden_vectors.rs` regression guard that reads the committed fixture and re-validates every hash on `cargo test` +- Bumps crate version to `0.3.0` +- Adds `release.toml` for `cargo-release` +- Adds `verify-version-tag-sync` CI job to `maturin-release.yml` +- Adds manual `release.yml` workflow using `cargo-release` + GitHub App token + +Part of PLT-1735. The Python side (starfix-python) consumes `tests/golden/include_metadata_v0.3.json` — see companion PR. + +## Test plan +- [ ] `cargo test` passes (including new `golden_vectors` tests) +- [ ] `cargo clippy --all-targets -- -D warnings` clean +- [ ] `cargo fmt --check` clean +- [ ] Fixture JSON has 9 entries; `key_reorder_canonical` and `key_reorder_shuffled` share the same `expected_hash` + +🤖 Generated with [Claude Code](https://claude.com/claude-code) +EOF +)" \ + --base dev 2>/dev/null || \ +gh pr create \ + --title "feat(PLT-1735): golden vector parity — Rust side" \ + --body "Part of PLT-1735. Golden fixture + regression tests + release infra." 
\ + --base main +``` + +--- + +## Phase 2: Python (`nauticalab/starfix-python`) + +Working directory: `/home/kurouto/kurouto-jobs/c2e6a292-30cd-4f99-9c0b-5fb08c4d7ddf/starfix-python` + +--- + +### Task 9: Create branch and copy fixture + +**Files:** +- Create: `tests/golden/include_metadata_v0.3.json` +- Create: `docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md` (already done in brainstorming) +- Create: `docs/metamorphic/plans/2026-06-18-golden-vector-parity.md` (this file) + +- [ ] **Step 1: Create and check out the branch** + +```bash +cd /home/kurouto/kurouto-jobs/c2e6a292-30cd-4f99-9c0b-5fb08c4d7ddf/starfix-python +git checkout -b eywalker/plt-1735-starfix-starfix-python-cross-language-hash-parity-via-golden +``` + +- [ ] **Step 2: Copy the fixture from the Rust repo** + +```bash +mkdir -p tests/golden +cp ../starfix/tests/golden/include_metadata_v0.3.json tests/golden/include_metadata_v0.3.json +``` + +- [ ] **Step 3: Verify the fixture is valid** + +```bash +python3 -m json.tool tests/golden/include_metadata_v0.3.json | head -20 +``` + +Expected: valid JSON with 9 vectors. + +- [ ] **Step 4: Commit** + +```bash +git add tests/golden/include_metadata_v0.3.json \ + docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md \ + docs/metamorphic/plans/2026-06-18-golden-vector-parity.md +git commit -m "docs: add golden-vector parity spec, plan, and fixture (PLT-1735)" +``` + +--- + +### Task 10: Write `tests/test_golden_parity_metadata.py` + +**Files:** +- Create: `tests/test_golden_parity_metadata.py` + +- [ ] **Step 1: Write the test file** + +```python +"""Cross-language golden-hash parity tests for include_metadata vectors (PLT-1735). + +Every expected hash in `tests/golden/include_metadata_v0.3.json` was generated by +`cargo run --bin emit_golden_metadata` in the Rust starfix crate and is treated as +the authoritative source of truth. These tests will fail immediately if the Python +implementation diverges from Rust. 
+ +Covers 9 vectors: + - no_metadata_include_false (regression: same as v0.1.0 schema hash) + - schema_level_metadata (schema-level metadata, include_metadata=True) + - field_metadata_single_field (single field with metadata, include_metadata=True) + - field_metadata_multiple_fields (two fields with metadata, include_metadata=True) + - schema_and_field_metadata (both schema- and field-level, include_metadata=True) + - unicode_metadata (emoji + CJK keys/values, include_metadata=True) + - key_reorder_canonical (metadata keys in alphabetical order) + - key_reorder_shuffled (same keys, different insertion order — same hash) + - empty_metadata_invariant (no metadata — hash must be same for both flag values) +""" +from __future__ import annotations + +import base64 +import io +import json +from pathlib import Path +from typing import Any + +import pyarrow as pa +import pytest + +from starfix.arrow_digester import ArrowDigester + +_FIXTURE_PATH = Path(__file__).parent / "golden" / "include_metadata_v0.3.json" + + +def _load_vectors() -> list[dict[str, Any]]: + with _FIXTURE_PATH.open() as f: + return json.load(f)["vectors"] + + +def _get_vector(vector_id: str) -> dict[str, Any]: + for v in _load_vectors(): + if v["id"] == vector_id: + return v + raise KeyError(f"Vector '{vector_id}' not found in fixture") + + +def _deserialize_ipc(ipc_b64: str) -> tuple[pa.Schema, pa.RecordBatch | None]: + raw = base64.b64decode(ipc_b64) + reader = pa.ipc.open_stream(io.BytesIO(raw)) + schema = reader.schema_arrow + batches = list(reader) + batch = batches[0] if batches else None + return schema, batch + + +@pytest.mark.parametrize("vector", _load_vectors(), ids=lambda v: v["id"]) +def test_golden_vector(vector: dict[str, Any]) -> None: + """Each vector must produce the Rust-authoritative expected_hash.""" + schema, batch = _deserialize_ipc(vector["ipc_b64"]) + include_metadata: bool = vector["include_metadata"] + + if vector["method"] == "hash_schema": + result = 
ArrowDigester.hash_schema(schema, include_metadata=include_metadata) + elif vector["method"] == "hash_record_batch": + assert batch is not None, ( + f"Vector '{vector['id']}': method is hash_record_batch but IPC has no batch" + ) + result = ArrowDigester.hash_record_batch(batch, include_metadata=include_metadata) + else: + pytest.fail(f"Vector '{vector['id']}': unknown method '{vector['method']}'") + + assert result.hex() == vector["expected_hash"], ( + f"Vector '{vector['id']}' mismatch: {vector['description']}\n" + f" got: {result.hex()}\n" + f" expected: {vector['expected_hash']}" + ) + + +def test_empty_metadata_invariant_both_flags() -> None: + """Empty-metadata invariant: include_metadata=True must produce the same hash as False.""" + vector = _get_vector("empty_metadata_invariant") + schema, _ = _deserialize_ipc(vector["ipc_b64"]) + + hash_false = ArrowDigester.hash_schema(schema, include_metadata=False).hex() + hash_true = ArrowDigester.hash_schema(schema, include_metadata=True).hex() + + assert hash_false == vector["expected_hash"], ( + f"empty_metadata_invariant: include_metadata=False got {hash_false}, " + f"expected {vector['expected_hash']}" + ) + assert hash_true == vector["expected_hash"], ( + f"empty_metadata_invariant: include_metadata=True got {hash_true}, " + f"expected {vector['expected_hash']}" + ) + assert hash_false == hash_true, ( + "empty_metadata_invariant: both flag values must produce the same hash" + ) + + +def test_key_reorder_hashes_are_identical() -> None: + """key_reorder_canonical and key_reorder_shuffled must share the same expected_hash.""" + canonical = _get_vector("key_reorder_canonical") + shuffled = _get_vector("key_reorder_shuffled") + assert canonical["expected_hash"] == shuffled["expected_hash"], ( + "key_reorder_canonical and key_reorder_shuffled must have identical expected_hash\n" + f" canonical: {canonical['expected_hash']}\n" + f" shuffled: {shuffled['expected_hash']}" + ) +``` + +- [ ] **Step 2: Run the tests to 
verify they all pass** + +```bash +cd /home/kurouto/kurouto-jobs/c2e6a292-30cd-4f99-9c0b-5fb08c4d7ddf/starfix-python +uv run pytest tests/test_golden_parity_metadata.py -v 2>&1 +``` + +Expected: all 11 tests pass (9 parametrized `test_golden_vector` + `test_empty_metadata_invariant_both_flags` + `test_key_reorder_hashes_are_identical`). + +- [ ] **Step 3: Run the full test suite to confirm nothing is broken** + +```bash +uv run pytest tests/ -v 2>&1 | tail -30 +``` + +Expected: all tests pass. + +- [ ] **Step 4: Commit** + +```bash +git add tests/test_golden_parity_metadata.py +git commit -m "test: add cross-language golden parity tests for include_metadata (PLT-1735)" +``` + +--- + +### Task 11: Add `golden-sync-check` to `ci.yml` + +**Files:** +- Modify: `.github/workflows/ci.yml` + +- [ ] **Step 1: Add the job** + +Append the following job to the `jobs:` section of `.github/workflows/ci.yml`: + +```yaml + golden-sync-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Generate GitHub App token + id: app-token + uses: actions/create-github-app-token@v3 + with: + app-id: ${{ secrets.STARFIX_APP_ID }} + private-key: ${{ secrets.STARFIX_APP_PRIVATE_KEY }} + repositories: starfix + + - name: Fetch authoritative fixture from starfix + run: | + gh api repos/nauticalab/starfix/contents/tests/golden/include_metadata_v0.3.json \ + --jq '.content' | base64 -d > /tmp/upstream.json + env: + GH_TOKEN: ${{ steps.app-token.outputs.token }} + + - name: Fail on fixture drift + run: | + if ! diff tests/golden/include_metadata_v0.3.json /tmp/upstream.json; then + echo "ERROR: tests/golden/include_metadata_v0.3.json has drifted from nauticalab/starfix main." + echo "Copy the updated fixture from the starfix repo and commit it." 
+ exit 1 + fi +``` + +- [ ] **Step 2: Commit** + +```bash +git add .github/workflows/ci.yml +git commit -m "ci: add golden-sync-check drift gate (PLT-1735)" +``` + +--- + +### Task 12: Add `release.yml` workflow (Python) + +**Files:** +- Create: `.github/workflows/release.yml` + +- [ ] **Step 1: Write the workflow** + +```yaml +name: release + +on: + workflow_dispatch: + inputs: + version: + description: 'Release version (e.g. 0.3.0)' + required: true + type: string + +jobs: + release: + runs-on: ubuntu-latest + steps: + - name: Generate GitHub App token + id: app-token + uses: actions/create-github-app-token@v3 + with: + app-id: ${{ secrets.RELEASE_APP_ID }} + private-key: ${{ secrets.RELEASE_APP_PRIVATE_KEY }} + + - uses: actions/checkout@v4 + with: + token: ${{ steps.app-token.outputs.token }} + fetch-depth: 0 + + - name: Configure git + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Tag and push release + run: | + git tag v${{ inputs.version }} + git push origin v${{ inputs.version }} +``` + +Note: the tag push triggers `publish.yml` which runs tests, builds the wheel and sdist via `uv build`, and publishes to TestPyPI then PyPI using OIDC Trusted Publishing. + +Required secrets: `RELEASE_APP_ID`, `RELEASE_APP_PRIVATE_KEY` (GitHub App with `contents:write` on `nauticalab/starfix-python`). 
+ +- [ ] **Step 2: Commit** + +```bash +git add .github/workflows/release.yml +git commit -m "ci: add manual release workflow (PLT-1735)" +``` + +--- + +### Task 13: Push Python branch and open PR + +- [ ] **Step 1: Push the branch** + +```bash +git push -u origin eywalker/plt-1735-starfix-starfix-python-cross-language-hash-parity-via-golden +``` + +- [ ] **Step 2: Open the PR** + +```bash +gh pr create \ + --title "feat(PLT-1735): golden vector parity — Python side" \ + --body "$(cat <<'EOF' +## Summary + +- Adds `tests/golden/include_metadata_v0.3.json` (copied from authoritative Rust fixture) +- Adds `tests/test_golden_parity_metadata.py`: 9 parametrized cross-language parity tests + 2 invariant tests +- Adds `golden-sync-check` CI job to `ci.yml` — fetches fixture from `nauticalab/starfix` main on every PR and fails on drift +- Adds manual `release.yml` workflow using GitHub App token + +Requires `STARFIX_APP_ID` + `STARFIX_APP_PRIVATE_KEY` secrets (for drift check) and `RELEASE_APP_ID` + `RELEASE_APP_PRIVATE_KEY` secrets (for release workflow) to be set on the repo. + +Part of PLT-1735. Companion PR: nauticalab/starfix (Rust side). + +## Test plan +- [ ] `uv run pytest tests/ -v` passes (all golden parity tests green) +- [ ] `golden-sync-check` CI job passes (requires secrets configured) +- [ ] `test_empty_metadata_invariant_both_flags` passes +- [ ] `test_key_reorder_hashes_are_identical` passes + +🤖 Generated with [Claude Code](https://claude.com/claude-code) +EOF +)" \ + --base dev 2>/dev/null || \ +gh pr create \ + --title "feat(PLT-1735): golden vector parity — Python side" \ + --body "Part of PLT-1735. Golden fixture + parity tests + drift-check CI + release workflow." 
\ + --base main +``` diff --git a/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md b/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md new file mode 100644 index 0000000..5ecd1bc --- /dev/null +++ b/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md @@ -0,0 +1,389 @@ +# Cross-Language Hash Parity via Golden Vectors + +**Issue:** PLT-1735 +**Date:** 2026-06-18 +**Status:** Approved +**Repos:** `nauticalab/starfix` (authoritative), `nauticalab/starfix-python` (consumer) + +--- + +## Overview + +Both `starfix` (Rust) and `starfix-python` now implement `include_metadata` hashing (PLT-1733, +PLT-1734). This spec establishes a shared golden-vector fixture that proves the two +implementations produce bit-for-bit identical hashes for the same Arrow inputs. Rust is the +authoritative source; Python must match it exactly. + +--- + +## Fixture Format + +A single JSON file committed to both repos at: + +``` +tests/golden/include_metadata_v0.3.json +``` + +Top-level structure: + +```json +{ + "version": "0.3", + "generated_by": "cargo run --bin emit_golden_metadata", + "rust_commit": "", + "vectors": [ ... ] +} +``` + +Each entry in `vectors`: + +| Field | Type | Description | +|---|---|---| +| `id` | string | Unique slug (used as pytest test ID) | +| `description` | string | Human-readable summary of what this vector tests | +| `method` | string | `"hash_schema"` or `"hash_record_batch"` | +| `include_metadata` | bool | Value passed to the hasher | +| `ipc_b64` | string | Base64-encoded Arrow IPC stream (schema + optional rows) | +| `expected_hash` | string | Rust-authoritative hex-encoded hash digest | + +Arrow IPC is used for `ipc_b64` because it captures the exact bytes — including metadata key +insertion order — that were fed to the Rust hasher. This eliminates any risk of Python +constructing subtly different Arrow data. 
+ +--- + +## Required Vectors + +| id | Scenario | `include_metadata` | +|---|---|---| +| `no_metadata_include_false` | `{id: Int64, name: LargeUtf8}`, no metadata | `false` | +| `schema_level_metadata` | Schema with `{"version": "2"}` at schema level | `true` | +| `field_metadata_single_field` | One field with `{"unit": "kg"}` | `true` | +| `field_metadata_multiple_fields` | Two fields each with distinct metadata | `true` | +| `schema_and_field_metadata` | Both schema-level and field-level metadata | `true` | +| `unicode_metadata` | Emoji + CJK keys/values | `true` | +| `key_reorder_canonical` | Field metadata keys in alphabetical order | `true` | +| `key_reorder_shuffled` | Same keys, different insertion order — **same `expected_hash` as `key_reorder_canonical`** | `true` | +| `empty_metadata_invariant` | No metadata at all — tested with `include_metadata=false`; `expected_hash` must equal that of the same schema hashed with `include_metadata=true` | `false` | + +The `key_reorder_canonical` / `key_reorder_shuffled` pair encodes the key-ordering determinism +invariant directly in the fixture: the same keys inserted in two different orders must map to +the same `expected_hash`. The two `ipc_b64` blobs differ only when the IPC writer preserves insertion order; a writer that sorts metadata keys emits byte-identical blobs, and the pair then no longer exercises reordering. + +The `empty_metadata_invariant` entry pins the empty-metadata fixed point: a schema with no +metadata must produce the same hash regardless of `include_metadata`. Only one entry is needed +because both flag values produce the same hash by definition; a second entry would be +redundant. The Rust and Python tests assert `hash(schema, false) == hash(schema, true) == +expected_hash`. + +--- + +## Rust Side (`nauticalab/starfix`) + +### `src/bin/emit_golden_metadata.rs` + +Developer tool. 
Generates the fixture to stdout: + +``` +cargo run --bin emit_golden_metadata > tests/golden/include_metadata_v0.3.json +cargo fmt +``` + +Responsibilities: +- Constructs each Arrow schema/batch for the 9 vectors above +- Serialises each to an Arrow IPC stream, base64-encodes it +- Calls `ArrowDigester` to produce the authoritative hash +- Writes the complete JSON to stdout +- Embeds a `rust_commit` field by running `git rev-parse HEAD` via `std::process::Command` at generation time + +The header of this Rust source file (not the generated JSON, which cannot carry comments) contains a comment documenting the full regeneration procedure (see +§ Regeneration Workflow below). + +### `tests/golden_vectors.rs` + +Regression guard. Runs as part of `cargo test` (covered by the `test` job in +`maturin-release.yml`). + +For each entry in the committed fixture: +1. Decodes `ipc_b64` → Arrow IPC stream +2. Reads schema (and batch, if present) +3. Calls `ArrowDigester::hash_schema` or `ArrowDigester::hash_record_batch` with `include_metadata` +4. Asserts `hex::encode(result) == entry.expected_hash` + +On failure, the panic message includes `id` and `description` for immediate identification. + +Additionally, the test explicitly verifies the empty-metadata invariant by asserting (pseudocode — Rust has no keyword arguments; the real call passes the flag through the hasher configuration): + +```rust +assert_eq!( + hash(schema, include_metadata=false), + hash(schema, include_metadata=true), + "empty_metadata_invariant: hash must be equal regardless of include_metadata" +); +``` + +--- + +## Python Side (`nauticalab/starfix-python`) + +### `tests/golden/include_metadata_v0.3.json` + +Exact copy of the Rust-generated fixture. Committed alongside existing test files. Updated +whenever the Rust fixture is regenerated (see § Regeneration Workflow). + +### `tests/test_golden_parity_metadata.py` + +Parametrized test file. 
Each vector becomes one pytest case, identified by its `id` slug: + +```python +@pytest.mark.parametrize("vector", _load_vectors(), ids=lambda v: v["id"]) +def test_golden_vector(vector): + schema, batch = _deserialize_ipc(vector["ipc_b64"]) + include_metadata = vector["include_metadata"] + if vector["method"] == "hash_schema": + result = ArrowDigester.hash_schema(schema, include_metadata=include_metadata) + else: + result = ArrowDigester.hash_record_batch(batch, include_metadata=include_metadata) + assert result.hex() == vector["expected_hash"], ( + f"Vector '{vector['id']}' mismatch: {vector['description']}" + ) +``` + +The `empty_metadata_invariant` vector is additionally tested with `include_metadata=True` in a +dedicated assertion that reads the same `expected_hash`: + +```python +def test_empty_metadata_invariant_both_flags(): + # Load the empty_metadata_invariant vector and verify both flag values produce + # the same Rust-authoritative hash. + vector = _get_vector("empty_metadata_invariant") + schema, _ = _deserialize_ipc(vector["ipc_b64"]) + hash_false = ArrowDigester.hash_schema(schema, include_metadata=False).hex() + hash_true = ArrowDigester.hash_schema(schema, include_metadata=True).hex() + assert hash_false == vector["expected_hash"] + assert hash_true == vector["expected_hash"] +``` + +### `golden-sync-check` job in `.github/workflows/ci.yml` + +Prevents the committed fixture from drifting from the Rust authoritative source. Runs on every +PR and push to `main`. + +Uses `actions/create-github-app-token@v3` (GitHub-owned action) to generate a short-lived +installation token from a GitHub App with `contents:read` permission on `nauticalab/starfix`. 
+ +Required secrets in `starfix-python`: +- `STARFIX_APP_ID` — numeric GitHub App ID +- `STARFIX_APP_PRIVATE_KEY` — PEM private key + +```yaml +golden-sync-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Generate GitHub App token + id: app-token + uses: actions/create-github-app-token@v3 + with: + app-id: ${{ secrets.STARFIX_APP_ID }} + private-key: ${{ secrets.STARFIX_APP_PRIVATE_KEY }} + repositories: starfix + + - name: Fetch authoritative fixture from starfix + run: | + gh api repos/nauticalab/starfix/contents/tests/golden/include_metadata_v0.3.json \ + --jq '.content' | base64 -d > /tmp/upstream.json + env: + GH_TOKEN: ${{ steps.app-token.outputs.token }} + + - name: Fail on fixture drift + run: diff tests/golden/include_metadata_v0.3.json /tmp/upstream.json +``` + +--- + +## Regeneration Workflow + +When the Rust hasher changes and the fixture must be updated: + +1. In `starfix`: `cargo run --bin emit_golden_metadata > tests/golden/include_metadata_v0.3.json` +2. Run `cargo fmt` and verify `cargo test` passes (the `golden_vectors` test will validate the new file) +3. Commit the updated fixture and merge to `main` +4. In `starfix-python`: copy the file to `tests/golden/include_metadata_v0.3.json` and commit +5. The `golden-sync-check` CI job gates the Python PR — it will fail until the committed copy matches `starfix` main + +--- + +## Version Alignment + +Both repos are bumped to `v0.3.0` as part of this work. The hash format byte prefix +(`[0, 0, 1]` — hash spec version 0.0.1) is unchanged; this is a package version bump only. + +### Why `Cargo.toml` version and git tags are kept in sync + +Cargo requires a hardcoded version in `Cargo.toml`; there is no `hatch-vcs`-style +auto-derivation from git tags. 
The invariant is enforced instead by using `cargo-release`, +which atomically bumps `Cargo.toml`, commits the change, creates the matching git tag, and +pushes both — making it structurally impossible to tag without also updating `Cargo.toml`. + +### `release.toml` (new file, `nauticalab/starfix`) + +```toml +pre-release-commit-message = "chore: release v{{version}}" +tag-name = "v{{version}}" +push = true +publish = false # wheels go via maturin, not crates.io +``` + +### `cargo-release` CI enforcement (new job in `maturin-release.yml`) + +A lightweight check that runs on every tag push and fails if the tag name does not match +the version in `Cargo.toml`: + +```yaml +verify-version-tag-sync: + runs-on: ubuntu-latest + if: startsWith(github.ref, 'refs/tags/') + steps: + - uses: actions/checkout@v4 + - name: Verify Cargo.toml version matches tag + run: | + TAG="${GITHUB_REF#refs/tags/v}" + CARGO_VERSION=$(grep '^version' Cargo.toml | head -1 | sed 's/.*= *"\(.*\)"/\1/') + if [ "$TAG" != "$CARGO_VERSION" ]; then + echo "Tag $TAG does not match Cargo.toml version $CARGO_VERSION" + exit 1 + fi +``` + +### Manually-triggered release workflows + +#### `nauticalab/starfix` — new `.github/workflows/release.yml` + +```yaml +name: release +on: + workflow_dispatch: + inputs: + version: + description: 'Release version (e.g. 
0.3.0)' + required: true + type: string +jobs: + release: + runs-on: ubuntu-latest + steps: + - name: Generate GitHub App token + id: app-token + uses: actions/create-github-app-token@v3 + with: + app-id: ${{ secrets.RELEASE_APP_ID }} + private-key: ${{ secrets.RELEASE_APP_PRIVATE_KEY }} + + - uses: actions/checkout@v4 + with: + token: ${{ steps.app-token.outputs.token }} + fetch-depth: 0 + + - uses: actions-rust-lang/setup-rust-toolchain@v1 + with: + toolchain: 1.91.1 + + - run: cargo install cargo-release + + - name: Configure git + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Release + run: cargo release ${{ inputs.version }} --execute --no-confirm +``` + +A GitHub App token (rather than `GITHUB_TOKEN`) is required for checkout so that the tag +push from `cargo-release` triggers the downstream `maturin-release.yml` workflow. +`GITHUB_TOKEN`-pushed events do not trigger other workflows (GitHub's recursion guard). + +Required secrets: `RELEASE_APP_ID`, `RELEASE_APP_PRIVATE_KEY` — a GitHub App with +`contents:write` on `nauticalab/starfix`. + +**What this workflow does end-to-end:** +1. `cargo-release` bumps `Cargo.toml` → commits → creates `v{version}` tag → pushes both +2. Tag push fires `maturin-release.yml` → builds wheels → publishes to PyPI + +#### `nauticalab/starfix-python` — new `.github/workflows/release.yml` + +`hatch-vcs` reads the version from git tags automatically; there is no version file to +bump. The release workflow only needs to create and push the tag: + +```yaml +name: release +on: + workflow_dispatch: + inputs: + version: + description: 'Release version (e.g. 
0.3.0)' + required: true + type: string +jobs: + release: + runs-on: ubuntu-latest + steps: + - name: Generate GitHub App token + id: app-token + uses: actions/create-github-app-token@v3 + with: + app-id: ${{ secrets.RELEASE_APP_ID }} + private-key: ${{ secrets.RELEASE_APP_PRIVATE_KEY }} + + - uses: actions/checkout@v4 + with: + token: ${{ steps.app-token.outputs.token }} + + - name: Configure git + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Tag and push release + run: | + git tag v${{ inputs.version }} + git push origin v${{ inputs.version }} +``` + +Tag push fires the existing `publish.yml` → pure-Python package published to PyPI. + +### Release procedure (coordinated across both repos) + +1. Merge the PLT-1735 PRs in both repos to `main` +2. Trigger `starfix` → Actions → **release** → Run workflow → version: `0.3.0` +3. Trigger `starfix-python` → Actions → **release** → Run workflow → version: `0.3.0` +4. Confirm both PyPI packages show `0.3.0` + +--- + +## Out of Scope + +- The `include_metadata` implementation itself (PLT-1733, PLT-1734) +- Cross-version parity (v0.1.0 ↔ v0.2.0) — already covered by existing golden tests +- Future finer-grained metadata controls +- `hash_array` with `include_metadata` — arrays have no schema-level metadata; not applicable + +--- + +## Risks + +- **IPC metadata order:** The `key_reorder_*` vectors assume that the Arrow IPC writer keeps key insertion order in its FlatBuffers encoding. + This assumption is load-bearing: if the writer sorts metadata keys, both vectors carry byte-identical `ipc_b64` and no longer exercise reordering. If a future Arrow version changes this + behaviour the vectors would need to be regenerated, but the fixture format itself remains + valid. +- **Fixture drift:** Mitigated by the `golden-sync-check` CI job. If the GitHub App secret + expires or is revoked, the drift check will fail loudly rather than silently passing. +- **Version/tag sync:** The `verify-version-tag-sync` CI job enforces the invariant on every + tag push. 
If someone bypasses `cargo-release` and creates a tag manually without bumping + `Cargo.toml`, this job will catch it and fail the `maturin-release.yml` run before any + wheels are built. +- **GitHub App token for release:** Both release workflows require `RELEASE_APP_ID` and + `RELEASE_APP_PRIVATE_KEY` secrets. If these expire or are revoked the workflows will fail + at the token-generation step with a clear error — no silent failure. From 267fd1a39b74df4240dc9fd36fbabbcf2c94efde Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:42:35 +0000 Subject: [PATCH 02/16] docs: sync corrected implementation plan (PLT-1735) Co-Authored-By: Claude Sonnet 4.6 --- .../plans/2026-06-18-golden-vector-parity.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/metamorphic/plans/2026-06-18-golden-vector-parity.md b/docs/metamorphic/plans/2026-06-18-golden-vector-parity.md index 42d1010..b8ba9ef 100644 --- a/docs/metamorphic/plans/2026-06-18-golden-vector-parity.md +++ b/docs/metamorphic/plans/2026-06-18-golden-vector-parity.md @@ -64,6 +64,9 @@ version = "0.3.0" # ← bumped from 0.1.0 serde = { version = "1.0.228", features = ["derive"] } # ← add features = ["derive"] arrow-ipc = { version = "57.0.0" } # ← new base64 = "0.22" # ← new +hex = "0.4.3" # ← move from [dev-dependencies]; binary needs it + +# In [dev-dependencies], remove the hex line — it is now covered by [dependencies] above. # … existing [[bin]] for uniffi-bindgen unchanged … @@ -77,7 +80,7 @@ name = "emit_golden_metadata" # ← new cargo build 2>&1 | head -20 ``` -Expected: no errors (warnings about unused imports are fine at this stage). +Expected: no errors. 
- [ ] **Step 3: Commit** @@ -115,17 +118,14 @@ git commit -m "chore: bump to v0.3.0; add arrow-ipc, base64, serde derive" #![expect(clippy::unwrap_used, reason = "CLI tool — panics are acceptable")] use std::collections::HashMap; -use std::io::Cursor; -use std::sync::Arc; -use arrow::array::RecordBatch; use arrow_ipc::writer::StreamWriter; use arrow_schema::{DataType, Field, Schema}; use base64::Engine as _; -use serde::{Deserialize, Serialize}; +use serde::Serialize; use starfix::{ArrowDigester, HasherConfig}; -#[derive(Serialize, Deserialize)] +#[derive(Serialize)] struct GoldenFixture { version: String, generated_by: String, @@ -133,7 +133,7 @@ struct GoldenFixture { vectors: Vec, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize)] struct GoldenVector { id: String, description: String, From 18bac84954121d3e924b91b54e4802804fe2c5eb Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Fri, 19 Jun 2026 03:03:18 +0000 Subject: [PATCH 03/16] feat: add golden fixture tests/golden/include_metadata_v0.3.json (PLT-1735) --- tests/golden/include_metadata_v0.3.json | 79 +++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 tests/golden/include_metadata_v0.3.json diff --git a/tests/golden/include_metadata_v0.3.json b/tests/golden/include_metadata_v0.3.json new file mode 100644 index 0000000..43e975d --- /dev/null +++ b/tests/golden/include_metadata_v0.3.json @@ -0,0 +1,79 @@ +{ + "version": "0.3", + "generated_by": "cargo run --bin emit_golden_metadata", + "rust_commit": "c652de91e2163738da20cca424a5ffc68f16f53f", + "vectors": [ + { + "id": "no_metadata_include_false", + "description": "Schema {id: Int64, name: LargeUtf8}, no metadata, include_metadata=false", + "method": "hash_schema", + "include_metadata": false, + "ipc_b64": 
"/////7gAAAAQAAAAAAAKAAwACgAJAAQACgAAABAAAAAAAQQACAAIAAAABAAIAAAABAAAAAIAAABUAAAAFAAAABAAFAAQAA4ADwAEAAAACAAQAAAAGAAAAAwAAAAAAAEUEAAAAAAAAAAEAAQABAAAAAQAAABuYW1lAAAAABAAFAAQAAAADwAEAAAACAAQAAAAGAAAACAAAAAAAAACHAAAAAgADAAEAAsACAAAAEAAAAAAAAABAAAAAAIAAABpZAAA/////wAAAAA=", + "expected_hash": "00000131f10e388ffb366939a37d877568320d681eb2c6e81793259274afd3b10ae38d" + }, + { + "id": "schema_level_metadata", + "description": "Schema-level metadata {version: 2}, include_metadata=true", + "method": "hash_schema", + "include_metadata": true, + "ipc_b64": "/////7gAAAAQAAAAAAAKAA4ADAALAAQACgAAABQAAAAAAAABBAAKAAwAAAAIAAQACgAAAAgAAAA0AAAAAQAAAAwAAAAIAAwACAAEAAgAAAAIAAAADAAAAAEAAAAyAAAABwAAAHZlcnNpb24AAQAAABQAAAAQABQAEAAAAA8ABAAAAAgAEAAAABgAAAAgAAAAAAAAAhwAAAAIAAwABAALAAgAAABAAAAAAAAAAQAAAAACAAAAaWQAAAAAAAAAAAAA/////wAAAAA=", + "expected_hash": "00000185971192458c11308853f8647b688810b7efd3afb5e1a3811ebf1c449969b18d" + }, + { + "id": "field_metadata_single_field", + "description": "Single field x: Int32 with metadata {unit: kg}, include_metadata=true", + "method": "hash_schema", + "include_metadata": true, + "ipc_b64": "/////7gAAAAQAAAAAAAKAAwACgAJAAQACgAAABAAAAAAAQQACAAIAAAABAAIAAAABAAAAAEAAAAYAAAAAAASABgAFAAAABMACAAAAAwABAASAAAANAAAABgAAAAgAAAAAAAAAhwAAAAIAAwABAALAAgAAAAgAAAAAAAAAQAAAAABAAAAeAAAAAEAAAAMAAAACAAMAAgABAAIAAAACAAAAAwAAAACAAAAa2cAAAQAAAB1bml0AAAAAAAAAAAAAAAA/////wAAAAA=", + "expected_hash": "000001d1490be52114b6a5284a89e91522b111f3e6ee1534edf2e472956f07360df020" + }, + { + "id": "field_metadata_multiple_fields", + "description": "Two fields x:{unit:kg}, y:{unit:m}, include_metadata=true", + "method": "hash_schema", + "include_metadata": true, + "ipc_b64": 
"/////zgBAAAQAAAAAAAKAAwACgAJAAQACgAAABAAAAAAAQQACAAIAAAABAAIAAAABAAAAAIAAACIAAAAGAAAAAAAEgAaABQAAAATAAgAAAAMAAQAEgAAADAAAAAYAAAAHAAAAAAAAAMYAAAAAAAGAAgABgAGAAAAAAACAAAAAAABAAAAeQAAAAEAAAAEAAAAkP///wgAAAAMAAAAAQAAAG0AAAAEAAAAdW5pdAAAEgAYABQAAAATAAgAAAAMAAQAEgAAADQAAAAYAAAAIAAAAAAAAAIcAAAACAAMAAQACwAIAAAAIAAAAAAAAAEAAAAAAQAAAHgAAAABAAAADAAAAAgADAAIAAQACAAAAAgAAAAMAAAAAgAAAGtnAAAEAAAAdW5pdAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/////AAAAAA==", + "expected_hash": "000001fdaa74e60eec18b816ac8e732f943fe405aaa9465ad3b788c083322b0bf16757" + }, + { + "id": "schema_and_field_metadata", + "description": "Schema metadata {version:1} + field metadata {unit:kg}, include_metadata=true", + "method": "hash_schema", + "include_metadata": true, + "ipc_b64": "//////gAAAAQAAAAAAAKAA4ADAALAAQACgAAABQAAAAAAAABBAAKAAwAAAAIAAQACgAAAAgAAAAsAAAAAQAAAAQAAACE////CAAAAAwAAAABAAAAMQAAAAcAAAB2ZXJzaW9uAAEAAAAYAAAAAAASABgAFAAAABMACAAAAAwABAASAAAANAAAABgAAAAgAAAAAAAAAhwAAAAIAAwABAALAAgAAAAgAAAAAAAAAQAAAAABAAAAeAAAAAEAAAAMAAAACAAMAAgABAAIAAAACAAAAAwAAAACAAAAa2cAAAQAAAB1bml0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP////8AAAAA", + "expected_hash": "000001dd9181cde304d1e39d424283c46d0b63126fa12e1272ce92ca34d7d4389dec65" + }, + { + "id": "unicode_metadata", + "description": "Field metadata with emoji and CJK keys/values, include_metadata=true", + "method": "hash_schema", + "include_metadata": true, + "ipc_b64": "//////gAAAAQAAAAAAAKAAwACgAJAAQACgAAABAAAAAAAQQACAAIAAAABAAIAAAABAAAAAEAAAAYAAAAAAASABgAFAAAABMACAAAAAwABAASAAAALAAAABgAAAAMAAAAAAAAFBAAAAAAAAAABAAEAAQAAAAEAAAAZGF0YQAAAAACAAAAOAAAAAQAAADY////CAAAABQAAAAJAAAA5pel5pys6KqeAAAABgAAAOS4reaWhwAACAAMAAgABAAIAAAACAAAABQAAAAJAAAAdmFsdWVf4pyTAAAADgAAAGVtb2ppX2tleV/wn5SRAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP////8AAAAA", + "expected_hash": "00000155498f11ff0803127d60f8c6daa18b8c8dbb3411d2962e08ee0dfbce7054706b" + }, + { + "id": "key_reorder_canonical", + "description": "Field metadata {alpha,beta,gamma} inserted in alphabetical order, include_metadata=true", + "method": 
"hash_schema", + "include_metadata": true, + "ipc_b64": "//////gAAAAQAAAAAAAKAAwACgAJAAQACgAAABAAAAAAAQQACAAIAAAABAAIAAAABAAAAAEAAAAYAAAAAAASABgAFAAAABMACAAAAAwABAASAAAANAAAABgAAAAgAAAAAAAAAhwAAAAIAAwABAALAAgAAAAgAAAAAAAAAQAAAAABAAAAeAAAAAMAAABUAAAAKAAAAAQAAADA////CAAAAAwAAAABAAAAMwAAAAUAAABnYW1tYQAAAOD///8IAAAADAAAAAEAAAAyAAAABAAAAGJldGEAAAAACAAMAAgABAAIAAAACAAAAAwAAAABAAAAMQAAAAUAAABhbHBoYQAAAP////8AAAAA", + "expected_hash": "0000016a68d96d5c89d9b5de01d104c7ad1f51f9956bd8758691c9241ecacf32c701bf" + }, + { + "id": "key_reorder_shuffled", + "description": "Same metadata {alpha,beta,gamma} inserted in shuffled order — expected_hash must equal key_reorder_canonical", + "method": "hash_schema", + "include_metadata": true, + "ipc_b64": "//////gAAAAQAAAAAAAKAAwACgAJAAQACgAAABAAAAAAAQQACAAIAAAABAAIAAAABAAAAAEAAAAYAAAAAAASABgAFAAAABMACAAAAAwABAASAAAANAAAABgAAAAgAAAAAAAAAhwAAAAIAAwABAALAAgAAAAgAAAAAAAAAQAAAAABAAAAeAAAAAMAAABUAAAAKAAAAAQAAADA////CAAAAAwAAAABAAAAMwAAAAUAAABnYW1tYQAAAOD///8IAAAADAAAAAEAAAAyAAAABAAAAGJldGEAAAAACAAMAAgABAAIAAAACAAAAAwAAAABAAAAMQAAAAUAAABhbHBoYQAAAP////8AAAAA", + "expected_hash": "0000016a68d96d5c89d9b5de01d104c7ad1f51f9956bd8758691c9241ecacf32c701bf" + }, + { + "id": "empty_metadata_invariant", + "description": "Schema with no metadata — hash(include_metadata=false) must equal hash(include_metadata=true)", + "method": "hash_schema", + "include_metadata": false, + "ipc_b64": "/////7gAAAAQAAAAAAAKAAwACgAJAAQACgAAABAAAAAAAQQACAAIAAAABAAIAAAABAAAAAIAAABUAAAAFAAAABAAFAAQAA4ADwAEAAAACAAQAAAAGAAAAAwAAAAAAAEUEAAAAAAAAAAEAAQABAAAAAQAAABuYW1lAAAAABAAFAAQAAAADwAEAAAACAAQAAAAGAAAACAAAAAAAAACHAAAAAgADAAEAAsACAAAAEAAAAAAAAABAAAAAAIAAABpZAAA/////wAAAAA=", + "expected_hash": "00000131f10e388ffb366939a37d877568320d681eb2c6e81793259274afd3b10ae38d" + } + ] +} From b797287a2236d899b54535ec071e2d0253a198b6 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Fri, 19 Jun 2026 03:06:32 +0000 Subject: [PATCH 
04/16] test: add cross-language golden parity tests for include_metadata (PLT-1735) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds tests/test_golden_parity_metadata.py with 11 tests (9 parametrized + 2 invariant checks) driven by the Rust-authoritative golden fixture include_metadata_v0.3.json. Also fixes a unicode parity bug in _update_metadata_hash: json.dumps was escaping non-ASCII characters (e.g. emoji as surrogate pairs \ud83d\udd11, CJK as \u4e2d\u6587) whereas Rust serde_json emits literal UTF-8 — adding ensure_ascii=False restores byte-for-byte parity. Co-Authored-By: Claude Sonnet 4.6 --- src/starfix/arrow_digester.py | 2 +- tests/test_golden_parity_metadata.py | 121 +++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 tests/test_golden_parity_metadata.py diff --git a/src/starfix/arrow_digester.py b/src/starfix/arrow_digester.py index f1cf607..8e8ab4b 100644 --- a/src/starfix/arrow_digester.py +++ b/src/starfix/arrow_digester.py @@ -288,7 +288,7 @@ def _update_metadata_hash(hasher: _Hasher, schema: pa.Schema) -> None: meta_doc["schema"] = _sort_metadata(schema.metadata) if meta_doc: - hasher.update(json.dumps(meta_doc, separators=(",", ":")).encode()) + hasher.update(json.dumps(meta_doc, separators=(",", ":"), ensure_ascii=False).encode()) def _hash_schema(schema: pa.Schema, include_metadata: bool = False) -> bytes: diff --git a/tests/test_golden_parity_metadata.py b/tests/test_golden_parity_metadata.py new file mode 100644 index 0000000..fc5bd31 --- /dev/null +++ b/tests/test_golden_parity_metadata.py @@ -0,0 +1,121 @@ +"""Cross-language golden-hash parity tests for include_metadata vectors (PLT-1735). + +Every expected hash in `tests/golden/include_metadata_v0.3.json` was generated by +`cargo run --bin emit_golden_metadata` in the Rust starfix crate and is treated as +the authoritative source of truth. 
These tests will fail immediately if the Python +implementation diverges from Rust. + +Covers 9 vectors: + - no_metadata_include_false (regression: same as v0.1.0 schema hash) + - schema_level_metadata (schema-level metadata, include_metadata=True) + - field_metadata_single_field (single field with metadata, include_metadata=True) + - field_metadata_multiple_fields (two fields with metadata, include_metadata=True) + - schema_and_field_metadata (both schema- and field-level, include_metadata=True) + - unicode_metadata (emoji + CJK keys/values, include_metadata=True) + - key_reorder_canonical (metadata keys in alphabetical order) + - key_reorder_shuffled (same keys, different insertion order — same hash) + - empty_metadata_invariant (no metadata — hash must be same for both flag values) +""" +from __future__ import annotations + +import base64 +import io +import json +from pathlib import Path +from typing import Any + +import pyarrow as pa +import pytest + +from starfix.arrow_digester import ArrowDigester + +_FIXTURE_PATH = Path(__file__).parent / "golden" / "include_metadata_v0.3.json" + + +def _load_vectors() -> list[dict[str, Any]]: + with _FIXTURE_PATH.open() as f: + return json.load(f)["vectors"] + + +def _get_vector(vector_id: str) -> dict[str, Any]: + for v in _load_vectors(): + if v["id"] == vector_id: + return v + raise KeyError(f"Vector '{vector_id}' not found in fixture") + + +def _deserialize_ipc(ipc_b64: str) -> tuple[pa.Schema, pa.RecordBatch | None]: + raw = base64.b64decode(ipc_b64) + reader = pa.ipc.open_stream(io.BytesIO(raw)) + schema = reader.schema + batches = list(reader) + batch = batches[0] if batches else None + return schema, batch + + +@pytest.mark.parametrize("vector", _load_vectors(), ids=lambda v: v["id"]) +def test_golden_vector(vector: dict[str, Any]) -> None: + """Each vector must produce the Rust-authoritative expected_hash.""" + schema, batch = _deserialize_ipc(vector["ipc_b64"]) + include_metadata: bool = vector["include_metadata"] 
+ + if vector["method"] == "hash_schema": + result = ArrowDigester.hash_schema(schema, include_metadata=include_metadata) + elif vector["method"] == "hash_record_batch": + assert batch is not None, ( + f"Vector '{vector['id']}': method is hash_record_batch but IPC has no batch" + ) + result = ArrowDigester.hash_record_batch(batch, include_metadata=include_metadata) + else: + pytest.fail(f"Vector '{vector['id']}': unknown method '{vector['method']}'") + + assert result.hex() == vector["expected_hash"], ( + f"Vector '{vector['id']}' mismatch: {vector['description']}\n" + f" got: {result.hex()}\n" + f" expected: {vector['expected_hash']}" + ) + + +def test_empty_metadata_invariant_both_flags() -> None: + """Empty-metadata invariant: include_metadata=True must produce the same hash as False.""" + vector = _get_vector("empty_metadata_invariant") + schema, _ = _deserialize_ipc(vector["ipc_b64"]) + + hash_false = ArrowDigester.hash_schema(schema, include_metadata=False).hex() + hash_true = ArrowDigester.hash_schema(schema, include_metadata=True).hex() + + assert hash_false == vector["expected_hash"], ( + f"empty_metadata_invariant: include_metadata=False got {hash_false}, " + f"expected {vector['expected_hash']}" + ) + assert hash_true == vector["expected_hash"], ( + f"empty_metadata_invariant: include_metadata=True got {hash_true}, " + f"expected {vector['expected_hash']}" + ) + assert hash_false == hash_true, ( + "empty_metadata_invariant: both flag values must produce the same hash" + ) + + +def test_key_reorder_hashes_are_identical() -> None: + """key_reorder_canonical and key_reorder_shuffled must share the same expected_hash.""" + canonical = _get_vector("key_reorder_canonical") + shuffled = _get_vector("key_reorder_shuffled") + + schema_c, _ = _deserialize_ipc(canonical["ipc_b64"]) + schema_s, _ = _deserialize_ipc(shuffled["ipc_b64"]) + + hash_c = ArrowDigester.hash_schema(schema_c, include_metadata=True).hex() + hash_s = ArrowDigester.hash_schema(schema_s, 
include_metadata=True).hex() + + assert hash_c == canonical["expected_hash"], ( + f"key_reorder_canonical live hash drifted from fixture" + ) + assert hash_s == shuffled["expected_hash"], ( + f"key_reorder_shuffled live hash drifted from fixture" + ) + assert hash_c == hash_s, ( + "key_reorder_canonical and key_reorder_shuffled must have identical hashes\n" + f" canonical: {hash_c}\n" + f" shuffled: {hash_s}" + ) From c0384a138f2f30574515c70facb256f72ccaeda6 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Fri, 19 Jun 2026 03:08:56 +0000 Subject: [PATCH 05/16] fix: add ensure_ascii=False to _serialized_schema for non-ASCII field name parity --- src/starfix/arrow_digester.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/starfix/arrow_digester.py b/src/starfix/arrow_digester.py index 8e8ab4b..cfca8ff 100644 --- a/src/starfix/arrow_digester.py +++ b/src/starfix/arrow_digester.py @@ -197,7 +197,7 @@ def _serialized_schema(schema: pa.Schema) -> str: fields[field.name] = _sort_json_value(value) # Sort by field name (BTreeMap ordering) sorted_fields = OrderedDict(sorted(fields.items())) - return json.dumps(sorted_fields, separators=(",", ":")) + return json.dumps(sorted_fields, separators=(",", ":"), ensure_ascii=False) # --------------------------------------------------------------------------- From 02b8981706c4ba7e2a519ac490bf82706b62ee78 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Fri, 19 Jun 2026 03:09:44 +0000 Subject: [PATCH 06/16] ci: add golden-sync-check drift gate (PLT-1735) --- .github/workflows/ci.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ee8db13..d5d2810 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -60,3 +60,31 @@ jobs: LGPL-2.0-only, LGPL-2.0-or-later, LGPL-2.1-only, 
LGPL-2.1-or-later, LGPL-3.0-only, LGPL-3.0-or-later + + golden-sync-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Generate GitHub App token + id: app-token + uses: actions/create-github-app-token@v3 + with: + app-id: ${{ secrets.STARFIX_APP_ID }} + private-key: ${{ secrets.STARFIX_APP_PRIVATE_KEY }} + repositories: starfix + + - name: Fetch authoritative fixture from starfix + run: | + gh api repos/nauticalab/starfix/contents/tests/golden/include_metadata_v0.3.json \ + --jq '.content' | base64 -d > /tmp/upstream.json + env: + GH_TOKEN: ${{ steps.app-token.outputs.token }} + + - name: Fail on fixture drift + run: | + if ! diff tests/golden/include_metadata_v0.3.json /tmp/upstream.json; then + echo "ERROR: tests/golden/include_metadata_v0.3.json has drifted from nauticalab/starfix main." + echo "Copy the updated fixture from the starfix repo and commit it." + exit 1 + fi From f0797807dffa83d4a2cf542ab15e77e32a2f6011 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Fri, 19 Jun 2026 03:09:49 +0000 Subject: [PATCH 07/16] ci: add manual release workflow (PLT-1735) --- .github/workflows/release.yml | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .github/workflows/release.yml diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..f172563 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,35 @@ +name: release + +on: + workflow_dispatch: + inputs: + version: + description: 'Release version (e.g. 
0.3.0)' + required: true + type: string + +jobs: + release: + runs-on: ubuntu-latest + steps: + - name: Generate GitHub App token + id: app-token + uses: actions/create-github-app-token@v3 + with: + app-id: ${{ secrets.RELEASE_APP_ID }} + private-key: ${{ secrets.RELEASE_APP_PRIVATE_KEY }} + + - uses: actions/checkout@v4 + with: + token: ${{ steps.app-token.outputs.token }} + fetch-depth: 0 + + - name: Configure git + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Tag and push release + run: | + git tag v${{ inputs.version }} + git push origin v${{ inputs.version }} From 89158792db8e5df39c33b716c73c1f80ded30b24 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Fri, 19 Jun 2026 03:48:02 +0000 Subject: [PATCH 08/16] fix: update fixture description, add defensive error check to golden-sync-check --- .github/workflows/ci.yml | 6 ++++++ tests/golden/include_metadata_v0.3.json | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d5d2810..dae653b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -83,6 +83,12 @@ jobs: - name: Fail on fixture drift run: | + if [ ! -s /tmp/upstream.json ]; then + echo "ERROR: Could not fetch fixture from nauticalab/starfix main." + echo "Ensure nauticalab/starfix#main has tests/golden/include_metadata_v0.3.json" + echo "(starfix PLT-1735 PR must be merged before this check can pass)" + exit 1 + fi if ! diff tests/golden/include_metadata_v0.3.json /tmp/upstream.json; then echo "ERROR: tests/golden/include_metadata_v0.3.json has drifted from nauticalab/starfix main." echo "Copy the updated fixture from the starfix repo and commit it." 
diff --git a/tests/golden/include_metadata_v0.3.json b/tests/golden/include_metadata_v0.3.json index 43e975d..871c3b8 100644 --- a/tests/golden/include_metadata_v0.3.json +++ b/tests/golden/include_metadata_v0.3.json @@ -1,7 +1,7 @@ { "version": "0.3", "generated_by": "cargo run --bin emit_golden_metadata", - "rust_commit": "c652de91e2163738da20cca424a5ffc68f16f53f", + "rust_commit": "8eae026401c23f9c3807755d353fb26242a908a1", "vectors": [ { "id": "no_metadata_include_false", @@ -61,7 +61,7 @@ }, { "id": "key_reorder_shuffled", - "description": "Same metadata {alpha,beta,gamma} inserted in shuffled order — expected_hash must equal key_reorder_canonical", + "description": "Same metadata {alpha,beta,gamma} with shuffled HashMap input — arrow-ipc normalises key order before FlatBuffers encoding, so expected_hash equals key_reorder_canonical", "method": "hash_schema", "include_metadata": true, "ipc_b64": "//////gAAAAQAAAAAAAKAAwACgAJAAQACgAAABAAAAAAAQQACAAIAAAABAAIAAAABAAAAAEAAAAYAAAAAAASABgAFAAAABMACAAAAAwABAASAAAANAAAABgAAAAgAAAAAAAAAhwAAAAIAAwABAALAAgAAAAgAAAAAAAAAQAAAAABAAAAeAAAAAMAAABUAAAAKAAAAAQAAADA////CAAAAAwAAAABAAAAMwAAAAUAAABnYW1tYQAAAOD///8IAAAADAAAAAEAAAAyAAAABAAAAGJldGEAAAAACAAMAAgABAAIAAAACAAAAAwAAAABAAAAMQAAAAUAAABhbHBoYQAAAP////8AAAAA", From d57d90a836dade663d00ce6bb28da7db96817d73 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Fri, 19 Jun 2026 06:34:14 +0000 Subject: [PATCH 09/16] =?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94?= =?UTF-8?q?=20pin=20actions=20to=20full=20SHAs,=20add=20permissions=20bloc?= =?UTF-8?q?ks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Pin all GitHub Actions to full commit SHAs in ci.yml and release.yml - Add top-level permissions: contents: read to ci.yml - Add job-level permissions: contents: write to release job in release.yml - golden-sync-check job: explicit permissions: contents: read Co-Authored-By: 
Claude Sonnet 4.6 --- .github/workflows/ci.yml | 21 +++++++++++++-------- .github/workflows/release.yml | 7 ++++--- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dae653b..992cea3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,6 +6,9 @@ on: pull_request: branches: [main] +permissions: + contents: read + jobs: test: runs-on: ubuntu-latest @@ -13,10 +16,10 @@ jobs: matrix: python-version: ["3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - name: Install uv - uses: astral-sh/setup-uv@v5 + uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081 # v5 - name: Set up Python ${{ matrix.python-version }} run: uv python install ${{ matrix.python-version }} @@ -30,10 +33,10 @@ jobs: license-check: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - name: Install uv - uses: astral-sh/setup-uv@v5 + uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081 # v5 - name: Install dependencies run: uv sync --dev @@ -48,10 +51,10 @@ jobs: runs-on: ubuntu-latest if: github.event_name == 'pull_request' steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - name: Dependency review - uses: actions/dependency-review-action@v4 + uses: actions/dependency-review-action@2031cfc080254a8a887f58cffee85186f0e49e48 # v4 with: deny-licenses: >- GPL-2.0-only, GPL-2.0-or-later, @@ -63,12 +66,14 @@ jobs: golden-sync-check: runs-on: ubuntu-latest + permissions: + contents: read steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - name: Generate GitHub App token id: app-token - uses: actions/create-github-app-token@v3 + uses: actions/create-github-app-token@bcd2ba49218906704ab6c1aa796996da409d3eb1 # v3 
with: app-id: ${{ secrets.STARFIX_APP_ID }} private-key: ${{ secrets.STARFIX_APP_PRIVATE_KEY }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f172563..4a551ba 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -11,18 +11,19 @@ on: jobs: release: runs-on: ubuntu-latest + permissions: + contents: write steps: - name: Generate GitHub App token id: app-token - uses: actions/create-github-app-token@v3 + uses: actions/create-github-app-token@bcd2ba49218906704ab6c1aa796996da409d3eb1 # v3 with: app-id: ${{ secrets.RELEASE_APP_ID }} private-key: ${{ secrets.RELEASE_APP_PRIVATE_KEY }} - - uses: actions/checkout@v4 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: token: ${{ steps.app-token.outputs.token }} - fetch-depth: 0 - name: Configure git run: | From 633db5f6db1a552a05ee85bd1cae29b1d3253368 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Sat, 20 Jun 2026 00:59:28 +0000 Subject: [PATCH 10/16] =?UTF-8?q?fix:=20address=20Copilot=20review=20?= =?UTF-8?q?=E2=80=94=20spec=20accuracy,=20fork=20guard,=20version=20normal?= =?UTF-8?q?isation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Spec doc: correct three stale inaccuracies about arrow-ipc metadata key ordering (same corrections already applied to the Rust repo's copy): IPC intro no longer claims insertion order is preserved; key_reorder section now says byte-identical IPC blobs; risk section corrected to say arrow-ipc sorts keys alphabetically, not preserves insertion order - golden-sync-check: add job-level if guard so the job is skipped for PRs from forks (secrets not available to forked workflows) - release.yml: strip leading 'v' from the version input before tagging to prevent 'vv0.3.0' tags when operator includes the prefix Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/ci.yml | 5 ++++ 
.github/workflows/release.yml | 8 ++++-- .../2026-06-18-golden-vector-parity-design.md | 26 ++++++++++++------- 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 992cea3..c70d779 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -66,6 +66,11 @@ jobs: golden-sync-check: runs-on: ubuntu-latest + # Secrets are not available to fork PRs, so skip the check there. + # Internal PRs and pushes to main always run it. + if: > + github.event_name != 'pull_request' || + github.event.pull_request.head.repo.full_name == github.repository permissions: contents: read steps: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4a551ba..c5db02f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -32,5 +32,9 @@ jobs: - name: Tag and push release run: | - git tag v${{ inputs.version }} - git push origin v${{ inputs.version }} + # Strip a leading 'v' if the operator included one (e.g. "v0.3.0" → "0.3.0"), + # then always prefix with 'v' so the tag is exactly "v0.3.0". + VERSION="${{ inputs.version }}" + VERSION="${VERSION#v}" + git tag "v${VERSION}" + git push origin "v${VERSION}" diff --git a/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md b/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md index 5ecd1bc..9abdec1 100644 --- a/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md +++ b/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md @@ -46,9 +46,11 @@ Each entry in `vectors`: | `ipc_b64` | string | Base64-encoded Arrow IPC stream (schema + optional rows) | | `expected_hash` | string | Rust-authoritative hex-encoded hash digest | -Arrow IPC is used for `ipc_b64` because it captures the exact bytes — including metadata key -insertion order — that were fed to the Rust hasher. This eliminates any risk of Python -constructing subtly different Arrow data. 
+Arrow IPC is used for `ipc_b64` because it provides a stable, self-contained encoding of the +Arrow schema (including all metadata) that both Rust and Python can deserialize identically. +Note: `arrow-ipc`'s `metadata_to_fb` sorts metadata keys alphabetically before FlatBuffers +encoding (`ordered_keys.sort()` in convert.rs), so the IPC byte stream is deterministic +regardless of HashMap insertion order at the producer side. --- @@ -67,8 +69,12 @@ constructing subtly different Arrow data. | `empty_metadata_invariant` | No metadata at all — tested with `include_metadata=false`; `expected_hash` must equal that of the same schema hashed with `include_metadata=true` | `false` | The `key_reorder_canonical` / `key_reorder_shuffled` pair encodes the key-ordering determinism -invariant directly in the fixture: two different IPC blobs (different insertion orders) map to -the same `expected_hash`. +invariant directly in the fixture. Because `arrow-ipc` sorts metadata keys alphabetically before +FlatBuffers encoding, both vectors produce **byte-identical IPC blobs** — the insertion-order +invariant is enforced at the IPC level, not the hasher level. Both vectors therefore share the +same `ipc_b64` and the same `expected_hash`. The test verifies that the hasher also produces +matching output when the live hasher is called directly on schemas built with different insertion +orders. The `empty_metadata_invariant` entry pins the empty-metadata fixed point: a schema with no metadata must produce the same hash regardless of `include_metadata`. Only one entry is needed @@ -374,10 +380,12 @@ Tag push fires the existing `publish.yml` → pure-Python package published to P ## Risks -- **IPC metadata order:** Arrow IPC preserves key insertion order in its FlatBuffers encoding. - This is load-bearing for the `key_reorder_*` vectors. If a future Arrow version changes this - behaviour the vectors would need to be regenerated, but the fixture format itself remains - valid. 
+- **IPC metadata order:** Arrow IPC does **not** preserve key insertion order — `metadata_to_fb` + in arrow-ipc sorts keys alphabetically before FlatBuffers encoding (`ordered_keys.sort()` in + convert.rs). As a result, the `key_reorder_canonical` and `key_reorder_shuffled` vectors + produce byte-identical IPC blobs. If a future Arrow version changes this sorting behaviour, + the `key_reorder_*` IPC blobs would diverge and the vectors would need to be regenerated; + the fixture format itself remains valid. - **Fixture drift:** Mitigated by the `golden-sync-check` CI job. If the GitHub App secret expires or is revoked, the drift check will fail loudly rather than silently passing. - **Version/tag sync:** The `verify-version-tag-sync` CI job enforces the invariant on every From e09e0a057c9e9cd73d52da97daf1daf149906821 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Sat, 20 Jun 2026 01:16:25 +0000 Subject: [PATCH 11/16] =?UTF-8?q?fix:=20correct=20release=20architecture?= =?UTF-8?q?=20in=20spec=20=E2=80=94=20crates.io=20for=20Rust,=20no=20matur?= =?UTF-8?q?in?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The spec described the wrong release flow: maturin building Python wheels from the Rust repo and publishing them to PyPI. 
The correct architecture: - starfix (Rust) → crates.io via cargo-release (publish = true) - starfix-python (Python) → PyPI via publish.yml (unchanged) Update the spec to reflect: - release.toml: publish = true (not false) - CI job is in ci.yml (not maturin-release.yml) - Release workflow passes CARGO_REGISTRY_TOKEN - End-to-end description: cargo-release publishes crate to crates.io; tag push fires ci.yml (tests + version/tag sync), not wheel builds Co-Authored-By: Claude Sonnet 4.6 --- .../2026-06-18-golden-vector-parity-design.md | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md b/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md index 9abdec1..bda7f56 100644 --- a/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md +++ b/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md @@ -239,10 +239,10 @@ pushes both — making it structurally impossible to tag without also updating ` pre-release-commit-message = "chore: release v{{version}}" tag-name = "v{{version}}" push = true -publish = false # wheels go via maturin, not crates.io +publish = true # cargo-release publishes the crate to crates.io ``` -### `cargo-release` CI enforcement (new job in `maturin-release.yml`) +### `cargo-release` CI enforcement (`verify-version-tag-sync` job in `ci.yml`) A lightweight check that runs on every tag push and fails if the tag name does not match the version in `Cargo.toml`: @@ -250,10 +250,11 @@ the version in `Cargo.toml`: ```yaml verify-version-tag-sync: runs-on: ubuntu-latest - if: startsWith(github.ref, 'refs/tags/') steps: - - uses: actions/checkout@v4 + - if: startsWith(github.ref, 'refs/tags/') + uses: actions/checkout@v4 - name: Verify Cargo.toml version matches tag + if: startsWith(github.ref, 'refs/tags/') run: | TAG="${GITHUB_REF#refs/tags/v}" CARGO_VERSION=$(grep '^version' Cargo.toml | head -1 | sed 's/.*= *"\(.*\)"/\1/') @@ 
-267,6 +268,9 @@ verify-version-tag-sync: #### `nauticalab/starfix` — new `.github/workflows/release.yml` +`cargo-release` publishes the **Rust crate** to crates.io. The Python package `starfix` +is a separate project and is released exclusively through `nauticalab/starfix-python`. + ```yaml name: release on: @@ -305,18 +309,20 @@ jobs: - name: Release run: cargo release ${{ inputs.version }} --execute --no-confirm + env: + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} ``` A GitHub App token (rather than `GITHUB_TOKEN`) is required for checkout so that the tag -push from `cargo-release` triggers the downstream `maturin-release.yml` workflow. +push from `cargo-release` triggers the downstream `ci.yml` workflow. `GITHUB_TOKEN`-pushed events do not trigger other workflows (GitHub's recursion guard). Required secrets: `RELEASE_APP_ID`, `RELEASE_APP_PRIVATE_KEY` — a GitHub App with -`contents:write` on `nauticalab/starfix`. +`contents:write` on `nauticalab/starfix`; `CARGO_REGISTRY_TOKEN` — a crates.io API token. **What this workflow does end-to-end:** -1. `cargo-release` bumps `Cargo.toml` → commits → creates `v{version}` tag → pushes both -2. Tag push fires `maturin-release.yml` → builds wheels → publishes to PyPI +1. `cargo-release` bumps `Cargo.toml` → commits → publishes crate to crates.io → creates `v{version}` tag → pushes both +2. 
Tag push fires `ci.yml` → runs tests and verifies version/tag sync #### `nauticalab/starfix-python` — new `.github/workflows/release.yml` From f4e9f223a6634a47274cbfa8c70535c6fce96720 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Sat, 20 Jun 2026 01:18:46 +0000 Subject: [PATCH 12/16] =?UTF-8?q?fix:=20update=20spec=20=E2=80=94=20crates?= =?UTF-8?q?.io=20uses=20trusted=20publishing,=20no=20API=20token=20needed?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- .../specs/2026-06-18-golden-vector-parity-design.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md b/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md index bda7f56..490ec3f 100644 --- a/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md +++ b/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md @@ -309,16 +309,18 @@ jobs: - name: Release run: cargo release ${{ inputs.version }} --execute --no-confirm - env: - CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} ``` A GitHub App token (rather than `GITHUB_TOKEN`) is required for checkout so that the tag push from `cargo-release` triggers the downstream `ci.yml` workflow. `GITHUB_TOKEN`-pushed events do not trigger other workflows (GitHub's recursion guard). +crates.io publishing uses **trusted publishing** (OIDC) — no API token secret is needed. +The job requires `id-token: write` permission so GitHub Actions can mint the OIDC token +that crates.io accepts. + Required secrets: `RELEASE_APP_ID`, `RELEASE_APP_PRIVATE_KEY` — a GitHub App with -`contents:write` on `nauticalab/starfix`; `CARGO_REGISTRY_TOKEN` — a crates.io API token. +`contents:write` on `nauticalab/starfix`. **What this workflow does end-to-end:** 1. 
`cargo-release` bumps `Cargo.toml` → commits → publishes crate to crates.io → creates `v{version}` tag → pushes both From 0c3b7c92e1ae618c2b4f78e1b7db70456cefeb72 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Sat, 20 Jun 2026 21:07:17 +0000 Subject: [PATCH 13/16] fix: assert key_reorder fixture invariant; add pull-requests: read to dependency-review test_key_reorder_hashes_are_identical now explicitly asserts the fixture invariant (ipc_b64 and expected_hash are identical between the two vectors, because arrow-ipc normalises key order alphabetically before encoding) then hashes once as a live sanity check. Re-hashing both vectors would just hash the same bytes twice with no added signal beyond test_golden_vector. Also adds `pull-requests: read` to the dependency-review job's permissions block; actions/dependency-review-action requires it to fetch PR diff metadata and would fail at runtime with only `contents: read`. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/ci.yml | 3 +++ tests/test_golden_parity_metadata.py | 36 ++++++++++++++++------------ 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c70d779..c69e51a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -50,6 +50,9 @@ jobs: dependency-review: runs-on: ubuntu-latest if: github.event_name == 'pull_request' + permissions: + contents: read + pull-requests: read # required by actions/dependency-review-action to fetch PR diff steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 diff --git a/tests/test_golden_parity_metadata.py b/tests/test_golden_parity_metadata.py index fc5bd31..900544b 100644 --- a/tests/test_golden_parity_metadata.py +++ b/tests/test_golden_parity_metadata.py @@ -98,24 +98,30 @@ def test_empty_metadata_invariant_both_flags() -> None: def test_key_reorder_hashes_are_identical() -> None: - 
"""key_reorder_canonical and key_reorder_shuffled must share the same expected_hash.""" + """key_reorder_canonical and key_reorder_shuffled encode to identical IPC bytes. + + arrow-ipc normalises metadata key order (alphabetical sort) before FlatBuffers + encoding, so both vectors produce byte-identical blobs regardless of insertion + order. This test asserts those fixture invariants explicitly, then hashes once + as a live sanity check — re-hashing both would just hash the same bytes twice. + """ canonical = _get_vector("key_reorder_canonical") shuffled = _get_vector("key_reorder_shuffled") - schema_c, _ = _deserialize_ipc(canonical["ipc_b64"]) - schema_s, _ = _deserialize_ipc(shuffled["ipc_b64"]) - - hash_c = ArrowDigester.hash_schema(schema_c, include_metadata=True).hex() - hash_s = ArrowDigester.hash_schema(schema_s, include_metadata=True).hex() - - assert hash_c == canonical["expected_hash"], ( - f"key_reorder_canonical live hash drifted from fixture" + # Fixture invariants: arrow-ipc sorts keys before encoding, so both entries + # must carry the same IPC blob and the same expected hash. + assert canonical["ipc_b64"] == shuffled["ipc_b64"], ( + "key_reorder_canonical and key_reorder_shuffled must have identical ipc_b64 " + "(arrow-ipc normalises metadata key order before FlatBuffers encoding)" ) - assert hash_s == shuffled["expected_hash"], ( - f"key_reorder_shuffled live hash drifted from fixture" + assert canonical["expected_hash"] == shuffled["expected_hash"], ( + "key_reorder_canonical and key_reorder_shuffled must have identical expected_hash" ) - assert hash_c == hash_s, ( - "key_reorder_canonical and key_reorder_shuffled must have identical hashes\n" - f" canonical: {hash_c}\n" - f" shuffled: {hash_s}" + + # Sanity check: live hash must match the (single) fixture value. 
+ schema, _ = _deserialize_ipc(canonical["ipc_b64"]) + live_hash = ArrowDigester.hash_schema(schema, include_metadata=True).hex() + assert live_hash == canonical["expected_hash"], ( + f"key_reorder live hash drifted from fixture: " + f"got {live_hash}, expected {canonical['expected_hash']}" ) From e0e918da0f44188ae2b74574bd366e14cdc0fa7d Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Sun, 21 Jun 2026 02:07:54 +0000 Subject: [PATCH 14/16] fix: open fixture with explicit utf-8 encoding Path.open() without encoding falls back to the platform default, which can be cp1252 on Windows. The fixture contains UTF-8 JSON (unicode metadata keys/values in the unicode_metadata vector), so pin it to utf-8 to avoid decode errors on non-UTF-8 platforms. Co-Authored-By: Claude Sonnet 4.6 --- tests/test_golden_parity_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_golden_parity_metadata.py b/tests/test_golden_parity_metadata.py index 900544b..008a1d9 100644 --- a/tests/test_golden_parity_metadata.py +++ b/tests/test_golden_parity_metadata.py @@ -33,7 +33,7 @@ def _load_vectors() -> list[dict[str, Any]]: - with _FIXTURE_PATH.open() as f: + with _FIXTURE_PATH.open(encoding="utf-8") as f: return json.load(f)["vectors"] From 9ce01f633e34132aae358b9278d2b00ab3902a9f Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Sun, 21 Jun 2026 02:25:08 +0000 Subject: [PATCH 15/16] refactor: merge publish.yml into release.yml; pin all action SHAs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two-workflow chain (release.yml tags → publish.yml publishes on tag push) existed solely because GITHUB_TOKEN-authenticated pushes don't trigger other workflow runs. 
Now that they are a single workflow, the recursion guard is irrelevant — GITHUB_TOKEN with contents: write is enough to push the tag, so the GitHub App token and RELEASE_APP_* secrets are no longer needed. Merged structure (test → build → publish-testpypi → publish-pypi): - test: matrix 3.10/3.11/3.12, no fetch-depth needed - build: normalizes version once (job output), pushes tag, builds with hatch-vcs (fetch-depth: 0), uploads dist artifact - publish-testpypi: OIDC publish to TestPyPI - publish-pypi: OIDC publish to PyPI + GitHub Release (tag_name from build job output) All actions pinned to full commit SHAs: - actions/checkout 34e114876b0b11c390a56381ad16ebd13914f8d5 (v4) - astral-sh/setup-uv e58605a9b6da7c637471fab8847a5e5a6b8df081 (v5) - actions/upload-artifact ea165f8d65b6e75b540449e92b4886f43607fa02 (v4.6.2) - actions/download-artifact d3f86a106a0bac45b974a628896c90dbdf5c8093 (v4.3.0) - softprops/action-gh-release 3bb12739c298aeb8a4eeaf626c5b8d85266b0e65 (v2.6.2) Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/publish.yml | 104 ---------------------------- .github/workflows/release.yml | 124 +++++++++++++++++++++++++++++----- 2 files changed, 108 insertions(+), 120 deletions(-) delete mode 100644 .github/workflows/publish.yml diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml deleted file mode 100644 index 9563c40..0000000 --- a/.github/workflows/publish.yml +++ /dev/null @@ -1,104 +0,0 @@ -name: Publish to PyPI - -on: - push: - tags: - - "v[0-9]+.[0-9]+.[0-9]+" - -jobs: - test: - name: Test (Python ${{ matrix.python-version }}) - runs-on: ubuntu-latest - strategy: - fail-fast: true - matrix: - python-version: ["3.10", "3.11", "3.12"] - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 # required: hatch-vcs needs full tag history - - - name: Install uv - uses: astral-sh/setup-uv@v5 - - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - - - name: Install 
dependencies - run: uv sync --dev --python ${{ matrix.python-version }} - - - name: Run tests - run: uv run --python ${{ matrix.python-version }} pytest tests/ -v - - build: - name: Build distribution - needs: test - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 # required: hatch-vcs needs full tag history - - - name: Install uv - uses: astral-sh/setup-uv@v5 - - - name: Build wheel and sdist - run: uv build - - - name: Upload dist artifact - uses: actions/upload-artifact@v4 - with: - name: dist - path: dist/ - if-no-files-found: error - - publish-testpypi: - name: Publish → TestPyPI - needs: build - runs-on: ubuntu-latest - environment: - name: testpypi - url: https://test.pypi.org/p/starfix - permissions: - id-token: write # required for OIDC Trusted Publishing - steps: - - name: Install uv - uses: astral-sh/setup-uv@v5 - - - name: Download dist artifact - uses: actions/download-artifact@v4 - with: - name: dist - path: dist/ - - - name: Publish to TestPyPI - run: uv publish --publish-url https://test.pypi.org/legacy/ dist/* - - publish-pypi: - name: Publish → PyPI - needs: publish-testpypi - runs-on: ubuntu-latest - environment: - name: pypi - url: https://pypi.org/p/starfix - permissions: - id-token: write # required for OIDC Trusted Publishing - contents: write # required for creating GitHub Release - steps: - - name: Install uv - uses: astral-sh/setup-uv@v5 - - - name: Download dist artifact - uses: actions/download-artifact@v4 - with: - name: dist - path: dist/ - - - name: Publish to PyPI - run: uv publish dist/* - - - name: Create GitHub Release - uses: softprops/action-gh-release@v2 - with: - generate_release_notes: true - files: dist/* diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index c5db02f..19bd9b8 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -8,33 +8,125 @@ on: required: true type: string +# Minimal default; individual jobs declare only what they 
need. +permissions: + contents: read + jobs: - release: + test: + name: Test (Python ${{ matrix.python-version }}) + runs-on: ubuntu-latest + strategy: + fail-fast: true + matrix: + python-version: ["3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + + - name: Install uv + uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081 # v5 + + - name: Set up Python ${{ matrix.python-version }} + run: uv python install ${{ matrix.python-version }} + + - name: Install dependencies + run: uv sync --dev --python ${{ matrix.python-version }} + + - name: Run tests + run: uv run --python ${{ matrix.python-version }} pytest tests/ -v + + build: + name: Build distribution + needs: test runs-on: ubuntu-latest permissions: - contents: write + contents: write # required to push the release tag + outputs: + version: ${{ steps.normalize.outputs.version }} steps: - - name: Generate GitHub App token - id: app-token - uses: actions/create-github-app-token@bcd2ba49218906704ab6c1aa796996da409d3eb1 # v3 - with: - app-id: ${{ secrets.RELEASE_APP_ID }} - private-key: ${{ secrets.RELEASE_APP_PRIVATE_KEY }} + # Strip a leading 'v' once here; all downstream steps use the output. + - name: Normalize version + id: normalize + run: | + VERSION="${{ inputs.version }}" + VERSION="${VERSION#v}" + echo "version=${VERSION}" >> "${GITHUB_OUTPUT}" - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: - token: ${{ steps.app-token.outputs.token }} + fetch-depth: 0 # required: hatch-vcs derives the package version from git tags - name: Configure git run: | git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" - - name: Tag and push release + - name: Tag release run: | - # Strip a leading 'v' if the operator included one (e.g. "v0.3.0" → "0.3.0"), - # then always prefix with 'v' so the tag is exactly "v0.3.0". 
- VERSION="${{ inputs.version }}" - VERSION="${VERSION#v}" - git tag "v${VERSION}" - git push origin "v${VERSION}" + git tag "v${{ steps.normalize.outputs.version }}" + git push origin "v${{ steps.normalize.outputs.version }}" + + - name: Install uv + uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081 # v5 + + - name: Build wheel and sdist + run: uv build + + - name: Upload dist artifact + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: dist + path: dist/ + if-no-files-found: error + + publish-testpypi: + name: Publish → TestPyPI + needs: build + runs-on: ubuntu-latest + environment: + name: testpypi + url: https://test.pypi.org/p/starfix + permissions: + id-token: write # required for OIDC trusted publishing + steps: + - name: Install uv + uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081 # v5 + + - name: Download dist artifact + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 + with: + name: dist + path: dist/ + + - name: Publish to TestPyPI + run: uv publish --publish-url https://test.pypi.org/legacy/ dist/* + + publish-pypi: + name: Publish → PyPI + needs: [build, publish-testpypi] + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/starfix + permissions: + id-token: write # required for OIDC trusted publishing + contents: write # required for creating the GitHub Release + steps: + - name: Install uv + uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081 # v5 + + - name: Download dist artifact + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 + with: + name: dist + path: dist/ + + - name: Publish to PyPI + run: uv publish dist/* + + - name: Create GitHub Release + uses: softprops/action-gh-release@3bb12739c298aeb8a4eeaf626c5b8d85266b0e65 # v2.6.2 + with: + tag_name: "v${{ needs.build.outputs.version }}" + generate_release_notes: true + files: dist/* From 
f256d6c49d7c83e62b68db852d2001deaca70995 Mon Sep 17 00:00:00 2001 From: "agent-kurodo[bot]" <268466204+agent-kurodo[bot]@users.noreply.github.com> Date: Sun, 21 Jun 2026 02:37:49 +0000 Subject: [PATCH 16/16] fix: defer tag push until after build; update docs for merged release workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit release.yml: split "Tag release" into two steps — create the tag locally first (so hatch-vcs can derive the version during uv build), then push to origin only after a successful build. The push step also fails fast with a clear error if the tag already exists on origin (e.g. workflow re-run for the same version). Adds a prominent comment on publish-testpypi noting that PyPI/TestPyPI Trusted Publisher configs must reference release.yml (not the former publish.yml). spec/plan docs: update all references to the old two-workflow model (tag-only release.yml + tag-triggered publish.yml, GitHub App token) to reflect the merged release.yml that runs tests, builds, publishes to TestPyPI→PyPI, and creates the GitHub Release using GITHUB_TOKEN only. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/release.yml | 26 ++++++++-- .../plans/2026-06-18-golden-vector-parity.md | 14 ++++-- .../2026-06-18-golden-vector-parity-design.md | 47 +++++-------------- 3 files changed, 43 insertions(+), 44 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 19bd9b8..fd9f97b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -61,10 +61,11 @@ jobs: git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" - - name: Tag release - run: | - git tag "v${{ steps.normalize.outputs.version }}" - git push origin "v${{ steps.normalize.outputs.version }}" + # Create the tag locally so hatch-vcs can derive the package version during + # `uv build`. 
The tag is pushed to origin only AFTER a successful build to + # avoid leaving a dangling remote tag if the build fails. + - name: Create local release tag + run: git tag "v${{ steps.normalize.outputs.version }}" - name: Install uv uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081 # v5 @@ -72,6 +73,18 @@ jobs: - name: Build wheel and sdist run: uv build + # Abort with a clear error if the tag is already on origin, instead of letting + # `git push` be rejected. Tags present at checkout are fetched (fetch-depth: 0), so + # a same-version re-run fails earlier at `git tag`; this guards against races. + - name: Push release tag + run: | + if git ls-remote --exit-code --tags origin \ + "refs/tags/v${{ steps.normalize.outputs.version }}" > /dev/null 2>&1; then + echo "ERROR: tag v${{ steps.normalize.outputs.version }} already exists on origin" >&2 + exit 1 + fi + git push origin "v${{ steps.normalize.outputs.version }}" + - name: Upload dist artifact uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: @@ -79,6 +92,11 @@ jobs: path: dist/ if-no-files-found: error + # IMPORTANT: The PyPI and TestPyPI Trusted Publisher configurations must reference + # this workflow file (`.github/workflows/release.yml`). If they still point to the + # former `publish.yml`, OIDC token exchange will be rejected by PyPI/TestPyPI. + # Update both configs at https://pypi.org and https://test.pypi.org before running + # this workflow for the first time.
publish-testpypi: name: Publish → TestPyPI needs: build diff --git a/docs/metamorphic/plans/2026-06-18-golden-vector-parity.md b/docs/metamorphic/plans/2026-06-18-golden-vector-parity.md index b8ba9ef..aa12026 100644 --- a/docs/metamorphic/plans/2026-06-18-golden-vector-parity.md +++ b/docs/metamorphic/plans/2026-06-18-golden-vector-parity.md @@ -1092,9 +1092,15 @@ jobs: git push origin v${{ inputs.version }} ``` -Note: the tag push triggers `publish.yml` which runs tests, builds the wheel and sdist via `uv build`, and publishes to TestPyPI then PyPI using OIDC Trusted Publishing. +Note: the merged `release.yml` handles the full pipeline — tests (matrix), build, +TestPyPI → PyPI publish (OIDC Trusted Publishing), and GitHub Release creation. No +GitHub App token is required; `GITHUB_TOKEN` with `contents: write` handles the tag +push. The PyPI/TestPyPI Trusted Publisher configuration must reference `release.yml` +(not the former `publish.yml`). -Required secrets: `RELEASE_APP_ID`, `RELEASE_APP_PRIVATE_KEY` (GitHub App with `contents:write` on `nauticalab/starfix-python`). +Required secrets: `STARFIX_APP_ID` + `STARFIX_APP_PRIVATE_KEY` (for `golden-sync-check` +in `ci.yml`). The `RELEASE_APP_ID` / `RELEASE_APP_PRIVATE_KEY` secrets are no longer +needed by `release.yml`. 
- [ ] **Step 2: Commit** @@ -1124,9 +1130,9 @@ gh pr create \ - Adds `tests/golden/include_metadata_v0.3.json` (copied from authoritative Rust fixture) - Adds `tests/test_golden_parity_metadata.py`: 9 parametrized cross-language parity tests + 2 invariant tests - Adds `golden-sync-check` CI job to `ci.yml` — fetches fixture from `nauticalab/starfix` main on every PR and fails on drift -- Adds manual `release.yml` workflow using GitHub App token +- Adds manual `release.yml` workflow: tests → build → TestPyPI → PyPI → GitHub Release (OIDC, no API token secrets needed) -Requires `STARFIX_APP_ID` + `STARFIX_APP_PRIVATE_KEY` secrets (for drift check) and `RELEASE_APP_ID` + `RELEASE_APP_PRIVATE_KEY` secrets (for release workflow) to be set on the repo. +Requires `STARFIX_APP_ID` + `STARFIX_APP_PRIVATE_KEY` secrets (for `golden-sync-check` drift check) to be set on the repo. Also requires the PyPI/TestPyPI Trusted Publisher configs to reference `.github/workflows/release.yml`. Part of PLT-1735. Companion PR: nauticalab/starfix (Rust side). diff --git a/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md b/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md index 490ec3f..c2663ef 100644 --- a/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md +++ b/docs/metamorphic/specs/2026-06-18-golden-vector-parity-design.md @@ -326,47 +326,22 @@ Required secrets: `RELEASE_APP_ID`, `RELEASE_APP_PRIVATE_KEY` — a GitHub App w 1. `cargo-release` bumps `Cargo.toml` → commits → publishes crate to crates.io → creates `v{version}` tag → pushes both 2. Tag push fires `ci.yml` → runs tests and verifies version/tag sync -#### `nauticalab/starfix-python` — new `.github/workflows/release.yml` +#### `nauticalab/starfix-python` — `.github/workflows/release.yml` `hatch-vcs` reads the version from git tags automatically; there is no version file to -bump. The release workflow only needs to create and push the tag: +bump. 
The release workflow handles the full pipeline: tests, build, TestPyPI → PyPI +publish, and GitHub Release creation. No GitHub App token is needed — `GITHUB_TOKEN` +with `contents: write` is sufficient for the tag push. -```yaml -name: release -on: - workflow_dispatch: - inputs: - version: - description: 'Release version (e.g. 0.3.0)' - required: true - type: string -jobs: - release: - runs-on: ubuntu-latest - steps: - - name: Generate GitHub App token - id: app-token - uses: actions/create-github-app-token@v3 - with: - app-id: ${{ secrets.RELEASE_APP_ID }} - private-key: ${{ secrets.RELEASE_APP_PRIVATE_KEY }} - - - uses: actions/checkout@v4 - with: - token: ${{ steps.app-token.outputs.token }} +The tag is created locally first (so `hatch-vcs` can derive the version during `uv build`), +then pushed to origin only after a successful build to avoid leaving a dangling remote tag +on build failure. - - name: Configure git - run: | - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - - - name: Tag and push release - run: | - git tag v${{ inputs.version }} - git push origin v${{ inputs.version }} -``` +Job sequence: `test` (matrix 3.10/3.11/3.12) → `build` (local tag + build + push tag) → +`publish-testpypi` → `publish-pypi` (PyPI publish + GitHub Release). -Tag push fires the existing `publish.yml` → pure-Python package published to PyPI. +> **Note:** The PyPI and TestPyPI Trusted Publisher configurations must reference +> `.github/workflows/release.yml`. Update both configs before triggering the first release. ### Release procedure (coordinated across both repos)