From d8b10b80ae1669ac78f759d59d4cd39dac9821c7 Mon Sep 17 00:00:00 2001 From: Connor McDonald Date: Mon, 29 Jun 2026 10:41:49 +0200 Subject: [PATCH] feat(hash): full bound/free identifier split, v2 recipe (#77) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v1 alpha-renamed every identifier, so re-pointing a span at a different single-occurrence external symbol was byte-identical and passed the gate silently while the claim's prose became false. v2 now alpha-renames only *bound* names (the symbol's own name, params, locals, loop/range/comprehension vars, with/catch aliases, generic params, destructuring binders) and emits every *free* identifier verbatim, so swapping a member, call target, type, enum/const, object key, or decorator is loud even when it occurs once, while consistent local renames stay quiet. This completes v2 into the recipe the original design intended: it subsumes the member-access-only first cut and the Python decorator special case (#8). 0.7.0 is unreleased, so v2 is redefined in place rather than adding a v3; v1 stays byte-frozen (golden fixtures confirm released stamps are safe). Binding detection is tree-sitter-only and fail-closed: a position not positively recognized as a binding defaults to free. The one accepted approximation (match-arm pattern identifiers are left free) is documented and pinned. A new in-tree differential harness gates the change — 13 benign renames with zero regressions, 12 semantic free-swaps with 100% v2 catch / 0% v1 — alongside re-pinned golden digests, the version-table governance in docs/reference/hash-recipes.md, and dogfood claims in hubs/hash.md. Co-Authored-By: Claude Opus 4.8 --- AGENTS.md | 2 +- CHANGELOG.md | 22 +- docs/reference/hash-recipes.md | 106 ++++++-- docs/reference/how-it-works.md | 16 +- hubs/anchor.md | 2 +- hubs/cli-check.md | 6 +- hubs/cli-for.md | 4 +- hubs/cli-git.md | 8 +- hubs/cli-lint.md | 6 +- hubs/cli-reference.md | 2 +- hubs/cli-scaffold.md | 4 +- hubs/cli-stats.md | 4 +- hubs/cli-suggest.md | 2 +- hubs/cli-verify.md | 2 +- hubs/cli-workspace.md | 4 +- hubs/config.md | 2 +- hubs/hash.md | 44 ++-- hubs/hub-format.md | 4 +- hubs/lang.md | 2 +- hubs/rename.md | 2 +- hubs/resolve.md | 6 +- surf-core/src/hash.rs | 322 ++++++++++++++++++++---- surf-core/tests/differential_hash.rs | 353 +++++++++++++++++++++++++++ surf-core/tests/golden_hash.rs | 58 +++-- 24 files changed, 823 insertions(+), 160 deletions(-) create mode 100644 surf-core/tests/differential_hash.rs diff --git a/AGENTS.md b/AGENTS.md index 4e5cb36..ef57df0 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -5,7 +5,7 @@ anchors: surf lint blocks when AGENTS.md carries a surf:hubs block that does not link the configured hubs directory, or when that directory does not exist; without the block it stays silent. at: surf-cli/src/lint.rs > lint_agents_pointer - hash: 2:9a5f7d9fd0db + hash: 2:ac139b65f5f0 refs: [] --- diff --git a/CHANGELOG.md b/CHANGELOG.md index cd02928..abf4167 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,13 +21,21 @@ project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). json` is unchanged. - **`surf new` scaffold** ships a prose-first template (`## How it works` / `## Boundary` headings and a multi-anchor example claim) so a fresh hub is shaped like an onboarding doc. -- **Hash recipe v2 (member-access names verbatim).** The canonical hash now keeps the - property/field component of a member-access expression verbatim instead of alpha-renaming it, - so re-pointing an anchored span at a *different* external symbol — `PointsTier.TIER_1` → - `TIER_2`, `b.Del` → `b.Keep`, `ProbeColor.RED` → `GREEN` — changes the hash even when the name - occurs once. Previously these passed the gate silently while the claim's prose became false - (#140, the member-access slice of #77). Consistent local/parameter renames stay quiet, as - before. Covers TypeScript, Go, Rust, and Python. +- **Hash recipe v2 — the bound/free split (#77).** The canonical hash now alpha-renames only + *bound* identifiers (a symbol's own name, parameters, locals, loop/range/comprehension + variables, `with`/`catch` aliases, generic params, destructuring binders) and emits every + *free* identifier verbatim (external members, call targets, types, enum/constant references, + object keys, decorators). Re-pointing an anchored span at a *different* symbol is now loud even + when the name occurs once — `PointsTier.TIER_1` → `TIER_2`, `getHighest` → `getLowest`, a bare + `helper(x)` → `other(x)`, a parameter type `Foo` → `Bar`, an object key `{ alpha }` → + `{ beta }` — where before it passed the gate silently while the claim's prose became false. + Consistent local/parameter renames stay quiet, as before. This subsumes the earlier + member-access-only first cut and the Python decorator special case (#8). A new in-tree + differential harness (`surf-core/tests/differential_hash.rs`) gates the change: zero + benign-rename regressions, 100% catch on the semantic free-swap corpus across all four + languages. Binding detection is tree-sitter-only and fail-closed; the one accepted + approximation (match-arm pattern identifiers are treated as free) is documented in + [Hash recipes](docs/reference/hash-recipes.md). Covers TypeScript, Go, Rust, and Python. - **Versioned stamps.** Stored hashes now carry their recipe: a v2 stamp is prefixed `2:`, a bare 12-hex stamp is an implicit v1. `surf check` verifies each stamp under its own recipe, so existing v1 stamps keep passing (with a one-line nudge) until `surf verify` re-stamps them as diff --git a/docs/reference/hash-recipes.md b/docs/reference/hash-recipes.md index 5eaafed..83e2edf 100644 --- a/docs/reference/hash-recipes.md +++ b/docs/reference/hash-recipes.md @@ -57,17 +57,51 @@ Walk the resolved span's syntax tree into tokens: SHA-256 of the token stream, truncated to 12 hex. -**Known blind spot (#77):** because *every* identifier is alpha-renamed, re-pointing a span at a -different single-occurrence external symbol (`PointsTier.TIER_1` → `TIER_2`, `b.Del` → `b.Keep`) -yields a byte-identical stream — the claim's prose silently becomes false while the gate stays -green. - -### v2 — member-access names verbatim (surf ≥ 0.7.0; `2:` prefix) +**Known blind spot (#77, closed by v2):** because *every* identifier is alpha-renamed, re-pointing a +span at a different single-occurrence external symbol (`PointsTier.TIER_1` → `TIER_2`, `b.Del` → +`b.Keep`) yields a byte-identical stream — the claim's prose silently becomes false while the gate +stays green. This is exactly what the v2 bound/free split fixes. + +### v2 — the bound/free split (surf ≥ 0.7.0; `2:` prefix) + +v1 alpha-renames *every* identifier, which is its blind spot: an identifier occurring once maps to +the same placeholder no matter what it names, so re-pointing a span at a *different* single-occurrence +external symbol is byte-identical and silently passes. + +v2 fixes this by splitting identifiers into **bound** and **free**: + +- **Bound** — names *declared inside the hashed span*: the symbol's own name, parameters, locals, + loop/range/comprehension variables, `with`/`catch` aliases, generic parameters, and destructuring + binders. These are **alpha-renamed** exactly as in v1, so a consistent local rename still hashes + identically — rename tolerance (§6.1) is preserved. +- **Free** — everything else: external members, call targets, types, enum/constant references, + object/destructuring keys, decorator names, JSX tags. These are emitted **verbatim** (`kind:text`), + so re-pointing at a different symbol is loud *even when the name occurs once*. + +This closes the #77 class in general, not just for member accesses: `PointsTier.TIER_1` → `TIER_2`, +`getHighest` → `getLowest`, a bare `helper(x)` → `other(x)`, a parameter type `Foo` → `Bar`, and an +object key `{ alpha }` → `{ beta }` all now change the hash. It also **subsumes** the two special +cases the older design carried — a decorator name (#8) and a member-access name (the #140 first cut) +are simply free identifiers now; no dedicated branch is needed for either. (The member-access +positions keep one dedicated check so they stay verbatim even when their text collides with a bound +local — `x` the parameter vs `obj.x` the field — since that position can never *be* the binding.) + +Binding detection is tree-sitter-only — there is no scope analysis — so it is **fail-closed**: +a position not positively recognized as a binding defaults to *free* (verbatim). The two error +directions are not symmetric: misclassifying bound→free is a *visible* false positive (a benign +rename trips the gate, a human sees it); free→bound is the *invisible* miss this whole recipe exists +to prevent. So when in doubt, free wins. + +**Binding positions, per family** (the tables `surf-core/src/hash.rs` `bind_here` encodes): + +| Family | Bound positions | +|---|---| +| Rust | `function_item`/`function_signature_item` name; `parameter`/`let_declaration`/`for_expression`/`let_condition` patterns; `closure_parameters`; `type_parameters` | +| TypeScript | function/method/signature names; `required_parameter`/`optional_parameter` patterns; `variable_declarator` name; `arrow_function` single param; `for_in_statement` left; `catch_clause` parameter; `type_parameters` | +| Python | `function_definition` name; `parameters`/`lambda_parameters` (default *values* excluded); `assignment`/`augmented_assignment`/`for_statement`/`for_in_clause` left; `with`/`as` targets | +| Go | function/method/`var`/`const`/type-parameter names; `parameter_declaration` names (incl. grouped `a, b int`); `short_var_declaration`/`range_clause` left | -v1, plus one rule: the **property/field component of a member-access expression** is kept verbatim -(`kind:text`) instead of alpha-renamed. These positions name an *external* member, never a local -binding, so emitting them verbatim distinguishes "re-pointed at a different symbol" (loud) from -"renamed my own local" (still quiet — rename tolerance is preserved). Per family: +**Member-access positions kept verbatim even on a bound-name collision:** | Family | Member-access position | |---|---| @@ -76,19 +110,49 @@ binding, so emitting them verbatim distinguishes "re-pointed at a different symb | Rust | `field_identifier` as the `field` of a `field_expression` | | Python | the `attribute` identifier of an `attribute` node | -Everything else is identical to v1, so v1 ≡ v2 minus this single rule — a member-access-free span -hashes the same under both. This closes the #77 blind spot for member accesses (every reported -reproduction). Re-pointing at a non-member free identifier — a bare `Enum::VARIANT` path, a renamed -imported function called by bare name — is **not** yet covered; that is the full bound/free split -tracked in [#77](https://github.com/Connorrmcd6/surface/issues/77). +**Accepted approximation (the residue).** Without scope analysis, a match-arm / pattern identifier +is indistinguishable from a unit-variant *reference* (`Some(x)` binds `x`; `None` references a +variant — same syntax). v2 leaves all such pattern identifiers **free**. Fail-closed cuts both +ways: a unit-variant swap in a match arm is *caught* (the safe direction), but renaming a match-arm +catch-all *binding* is also loud — an accepted false positive, not a bug. This is the one benign +edit class v2 does not keep silent; a future scope-aware pass could reclaim it. The limit is pinned +in `surf-core/tests/differential_hash.rs`. + +## Version table + +surf keeps an explicit table of every recipe ever shipped, so any stamp's recipe is always +identifiable and every dropped recipe errors with a remedy rather than a generic mismatch. + +| Recipe | Stamp form | Shipped | Status | Remedy if rejected | +|---|---|---|---|---| +| v1 | bare 12-hex | surf ≤ 0.6.x | **supported** (N-1) until 0.8.0 | run `surf verify` to upgrade to v2 | +| v2 | `2:` + 12-hex | surf ≥ 0.7.0 | **current** | — | +| `N:` for unknown N | `N:` + hex | a newer surf | rejected (fails closed) | upgrade surf to a build that knows recipe N | + +- **Identification never expires.** The prefix is plain data; any future surf can name the recipe of + any stamp even after the recipe's verification code is deleted. A bare hex stamp is, and always + will be, v1. +- **N-1 support, at most one legacy mode.** surf verifies the current recipe and exactly one back. + v1 compatibility ships in 0.7.0 and is **removed in 0.8.0**; after that a bare-hex stamp is a hard, + named error ("stamped by surf < 0.7 — re-stamp with `surf verify`, or check with surf 0.7.x + first"), never a silent DIVERGED. A legacy recipe is retained *only* while it is expressible as a + mode of the current code (v1 ≡ v2 with "every identifier bound" — one flag, no frozen copy). If a + future recipe cannot express its predecessor that cheaply, that is the signal to drop compat and + require stepping through an intermediate release. ## Policy (for maintainers) - **Any** change to canonical output is a new recipe number — no exceptions. An innocent-looking refactor of the tokenizer that changes one byte of output is silently a new recipe wearing an old - number, which corrupts every stamp in the wild. The golden fixtures in - `surf-core/tests/golden_hash.rs` pin each recipe's output (v1 and v2 digests for representative - symbols per language) precisely to make that break loud. -- A recipe is kept as a verification mode only while it is expressible as a flag over the current - code (v1 ≡ v2 with the member-access rule off — one branch, no frozen copy). The N-1 support - policy and the broader version-table governance are tracked in #77. + number, which corrupts every stamp in the wild. Two layers make that break loud: + - **Golden fixtures** (`surf-core/tests/golden_hash.rs`) pin each recipe's exact digest for + representative symbols per language — both v1 (frozen forever) and v2. + - **Differential harness** (`surf-core/tests/differential_hash.rs`) re-runs the v1-vs-v2 A/B on + every build: zero benign-rename regressions, 100% catch on the semantic (free-swap) corpus. Any + future recipe change reruns the same gate. +- The recipe's rules are **dogfooded**: claims in `hubs/hash.md` are anchored to the canonicalization + code itself (`emit`, `collect_bound`, `is_member_access_name`), so editing the tokenizer without + updating this contract turns surf's own gate red. +- The external git-history replay over real corpora (prometheus for Go, nansen-python-sdk for + Python, surface itself for Rust, a large public TS repo) named in #77 runs out-of-tree against + release binaries; the in-tree harness above is its always-on, deterministic counterpart. diff --git a/docs/reference/how-it-works.md b/docs/reference/how-it-works.md index a63070a..1245dd0 100644 --- a/docs/reference/how-it-works.md +++ b/docs/reference/how-it-works.md @@ -11,14 +11,14 @@ The gate runs in four steps. `Type` alone is ambiguous, `Type > method` is unique. In Python the path also resolves non-callables: module constants, type aliases, and class attributes. 2. **Canonicalize.** Walk that span's syntax tree into a token stream. Whitespace and comments - aren't in the tree, so they drop out for free; identifiers are alpha-renamed to positional - placeholders (a *consistent* rename yields the same tokens, swapping two names does not); - operators, keywords, and literal *values* are kept verbatim. Python decorators are part of the - span, and a decorator's *name* is kept verbatim — so swapping `@cache` for `@lru_cache`, or - `@staticmethod` for `@classmethod`, changes the hash. **Member-access names are kept verbatim - too** (`obj.foo`, `pkg.Bar`, `Enum.VARIANT`), so re-pointing a span at a *different* external - symbol — `PointsTier.TIER_1` → `TIER_2`, `b.Del` → `b.Keep` — changes the hash even when the - name occurs once. (This last rule is the **v2** recipe; see [Hash recipes](./hash-recipes.md).) + aren't in the tree, so they drop out for free; operators, keywords, and literal *values* are + kept verbatim. Identifiers split into two kinds: a **bound** name (the symbol's own name, + parameters, locals, loop/destructuring binders) is alpha-renamed to a positional placeholder, + so a *consistent* local rename yields the same tokens; a **free** name (external members, call + targets, types, enum/constant references, object keys, decorators) is kept verbatim, so + re-pointing a span at a *different* symbol — `PointsTier.TIER_1` → `TIER_2`, `getHighest` → + `getLowest`, `@cache` → `@lru_cache` — changes the hash even when the name occurs once. (This + bound/free split is the **v2** recipe; see [Hash recipes](./hash-recipes.md).) 3. **Hash.** SHA-256 of that stream, truncated to 12 hex. A list `at:` combines its sites into one hash, so the claim is stale if *any* listed span changes. 4. **Compare** against the stamp stored in the frontmatter (written by `surf verify`). The stamp diff --git a/hubs/anchor.md b/hubs/anchor.md index 6255817..200ec48 100644 --- a/hubs/anchor.md +++ b/hubs/anchor.md @@ -6,7 +6,7 @@ anchors: a 1-based `@N` positional suffix for genuine name collisions. Empty/zero/missing parts are typed parse errors. at: surf-core/src/anchor.rs > parse_anchor - hash: 2:0f9a4f9d406d + hash: 2:5499582e3a55 refs: [] --- diff --git a/hubs/cli-check.md b/hubs/cli-check.md index fd1fa7d..2b1de46 100644 --- a/hubs/cli-check.md +++ b/hubs/cli-check.md @@ -8,7 +8,7 @@ anchors: a mismatch → Changed; a clean match is tagged with whether the stamp was still v1. The verdict is deterministic and needs no git. at: surf-cli/src/check.rs > check_claim - hash: 2:36cbbc039ab1 + hash: 2:66e7b4149d60 - claim: > Scoping is opt-in and intersective: with neither --base nor --files every claim is checked. A claim is in scope when any of its anchored files matches each active filter — the --base @@ -17,7 +17,7 @@ anchors: records whether it ever matched an anchored file (tallied before the --base filter), so a pattern that scopes the gate to nothing is detectable after the walk. at: surf-cli/src/check.rs > Scope > includes - hash: 2:d459cc00d69b + hash: 2:64277175938c - claim: > The gate fails closed: a hub whose frontmatter won't parse yields an Unresolvable divergence (blocking the run) rather than being silently skipped, so a frontmatter typo @@ -26,7 +26,7 @@ anchors: pattern matched nothing, so a typo'd --files can't read as a clean run) and a count of clean anchors still stamped under v1, so run can nudge the one-time `surf verify` upgrade. at: surf-cli/src/check.rs > check_workspace - hash: 2:d8957ecb971d + hash: 2:4f5890aca70c refs: [] --- diff --git a/hubs/cli-for.md b/hubs/cli-for.md index dc92cf2..21da904 100644 --- a/hubs/cli-for.md +++ b/hubs/cli-for.md @@ -9,13 +9,13 @@ anchors: versioned {version, path, matches} envelope (JSON), always exiting 0 whether or not anything matched. at: surf-cli/src/for_path.rs > run - hash: 2:4ef15aadc147 + hash: 2:991c3bcc234c - claim: > find collects every claim whose anchored file equals the queried path (matched on path only — no source parse), optionally narrowed to anchors whose first segment is the given symbol. Malformed hubs are skipped rather than erroring, and results are sorted by hub then anchor. at: surf-cli/src/for_path.rs > find - hash: 2:6eb52572ab68 + hash: 2:5d4d45bdf364 refs: [] --- diff --git a/hubs/cli-git.md b/hubs/cli-git.md index b1e7b30..d34b848 100644 --- a/hubs/cli-git.md +++ b/hubs/cli-git.md @@ -11,27 +11,27 @@ anchors: - surf-cli/src/git.rs > renamed_to - surf-cli/src/git.rs > log_stream - surf-cli/src/git.rs > list_files_at - hash: 2:874f501ad8f1 + hash: 2:95e280660c73 - claim: > changed_files returns workspace-root-relative paths changed between the merge base of base..HEAD and the working tree (git diff --relative), so the set intersects workspace-relative anchors even when the workspace is a repo subdirectory; a missing merge base (shallow clone) falls back to diffing the ref directly. at: surf-cli/src/git.rs > changed_files - hash: 2:e395bff5410d + hash: 2:86115d32f1c7 - claim: > log_stream returns the whole history window in one git spawn: every reachable commit (newest first, children before parents) with its parents and its first-parent name-status diff. Merges are included with --diff-merges=first-parent so surf stats can propagate hub state through them, and --no-renames keeps a rename reading as delete+add. at: surf-cli/src/git.rs > log_stream - hash: 2:c5d2fccc872e + hash: 2:a410122a0052 - claim: > renamed_to asks git's rename detection (diff --name-status --find-renames HEAD) for the new path a file moved to, letting lint warn and verify --follow re-point instead of hard-blocking. Best-effort: a pure mv with no content match may show as delete+add and go undetected. at: surf-cli/src/git.rs > renamed_to - hash: 2:a51ff4adba72 + hash: 2:260267073598 refs: [] --- diff --git a/hubs/cli-lint.md b/hubs/cli-lint.md index bddc8c8..4f48dd5 100644 --- a/hubs/cli-lint.md +++ b/hubs/cli-lint.md @@ -7,7 +7,7 @@ anchors: as does a file that git reports has moved. Block-level findings set a non-zero exit; warnings alone keep exit 0. at: surf-cli/src/lint.rs > lint_site - hash: 2:69018813a373 + hash: 2:97f0946e74b0 - claim: > Advisory granularity guidance (§8), never blocking: lint_under_coverage flags public symbols — top-level functions and methods — in an already-anchored file that no claim @@ -16,14 +16,14 @@ anchors: uncovered symbol is reported once against the file's first anchoring hub. It runs only on files whose anchors all resolved cleanly, so coverage nags never pile onto broken anchors. at: surf-cli/src/lint.rs > lint_under_coverage - hash: 2:3ca608c27462 + hash: 2:1a94fd3c8328 - claim: > AGENTS.md enforcement is opt-in (§11.6): only when the file carries a surf:hubs marker block does lint require it to link the configured hubs directory (which must exist), blocking otherwise. It points agents at the directory to search — never enumerating individual hubs, which would push an agent to read everything. at: surf-cli/src/lint.rs > lint_agents_pointer - hash: 2:9a5f7d9fd0db + hash: 2:ac139b65f5f0 refs: [] --- diff --git a/hubs/cli-reference.md b/hubs/cli-reference.md index 1dc9483..43ba106 100644 --- a/hubs/cli-reference.md +++ b/hubs/cli-reference.md @@ -9,7 +9,7 @@ anchors: flag, or changing a default, diverges this anchor — re-read docs/reference/commands.md before sealing. at: surf-cli/src/main.rs > Command - hash: 2:0d910ff4886d + hash: 2:1af394872add refs: ["../docs/reference/commands.md"] --- diff --git a/hubs/cli-scaffold.md b/hubs/cli-scaffold.md index 9db6100..e32b78d 100644 --- a/hubs/cli-scaffold.md +++ b/hubs/cli-scaffold.md @@ -5,12 +5,12 @@ anchors: init writes surf.toml + creates hubs/ in the cwd, and is idempotent — an existing surf.toml is left untouched. at: surf-cli/src/init.rs > run - hash: 2:dd57e4e7c5d9 + hash: 2:640471b94678 - claim: > new derives the target directory from the literal prefix of the first hub glob, then writes a hub with no anchors so it is lint-clean immediately; it refuses to overwrite. at: surf-cli/src/new.rs > hub_dir - hash: 2:d921913bf7bf + hash: 2:b9bfc7ec0b86 refs: [] --- diff --git a/hubs/cli-stats.md b/hubs/cli-stats.md index 79dd1e5..10f48f9 100644 --- a/hubs/cli-stats.md +++ b/hubs/cli-stats.md @@ -6,7 +6,7 @@ anchors: always exits 0 on success and surfaces an error (non-zero) only when git history is unavailable. The metrics are advisory and never gate. at: surf-cli/src/stats.rs > run - hash: 2:7f4ab96fac92 + hash: 2:7bcce388adbb - claim: > compute reads the whole since/until window from one streamed git log and scores each non-merge commit, propagating hub claim state incrementally — a commit inherits its first @@ -19,7 +19,7 @@ anchors: and missing git history or an invalid hub glob in surf.toml is a hard error rather than a silent zero or a quietly-narrowed hub set. at: surf-cli/src/stats.rs > compute - hash: 2:73bc9fa9daac + hash: 2:1422981eb9fa refs: ["../docs/guides/stats.md"] --- diff --git a/hubs/cli-suggest.md b/hubs/cli-suggest.md index d5ea8b3..03d5915 100644 --- a/hubs/cli-suggest.md +++ b/hubs/cli-suggest.md @@ -13,7 +13,7 @@ anchors: never writes a file and never computes or stamps a hash — the author edits the claims and verifies. at: surf-cli/src/suggest.rs > run - hash: 2:6d5ea2dc7760 + hash: 2:e1710f880435 refs: [] --- diff --git a/hubs/cli-verify.md b/hubs/cli-verify.md index 4b4f87e..066f592 100644 --- a/hubs/cli-verify.md +++ b/hubs/cli-verify.md @@ -10,7 +10,7 @@ anchors: path (only when the code is otherwise unchanged under the stored recipe). Otherwise it skips with a reason. It never edits prose, only the hash/at line. at: surf-cli/src/verify.rs > plan_claim - hash: 2:cc47fe88418b + hash: 2:18df2a40dd9d refs: [] --- diff --git a/hubs/cli-workspace.md b/hubs/cli-workspace.md index 65a982d..316582c 100644 --- a/hubs/cli-workspace.md +++ b/hubs/cli-workspace.md @@ -5,12 +5,12 @@ anchors: discover walks up from a starting directory to the nearest surf.toml (like git/ruff), parses it, and returns the root + config; it errors if no marker is found in any parent. at: surf-cli/src/workspace.rs > Workspace > discover - hash: 2:f9a5e81dc046 + hash: 2:7d57c89fcc0d - claim: > hub_paths globs the config's hub patterns relative to the discovered root, sorted and deduped. at: surf-cli/src/workspace.rs > Workspace > hub_paths - hash: 2:275e1726b702 + hash: 2:c69c8264bcfd refs: [] --- diff --git a/hubs/config.md b/hubs/config.md index c4f2f90..4b5deb6 100644 --- a/hubs/config.md +++ b/hubs/config.md @@ -5,7 +5,7 @@ anchors: surf.toml parses into a Config whose hubs default to ["hubs/*.md"]; unknown keys are rejected. Filesystem discovery (walking up for the marker) lives in the CLI, not here. at: surf-core/src/config.rs > parse_config - hash: 2:7b98f22a91b6 + hash: 2:ac841079e472 refs: [] --- diff --git a/hubs/hash.md b/hubs/hash.md index e317282..69dfec0 100644 --- a/hubs/hash.md +++ b/hubs/hash.md @@ -2,26 +2,39 @@ summary: AST-canonical hashing — quiet on cosmetics, loud on logic — and per-claim combination. anchors: - claim: > - The canonical token stream drops comments, alpha-renames identifiers to positional - placeholders (consistent rename → same tokens; swapping two names → different), and - keeps operators, keywords, and literal values verbatim. Exceptions kept verbatim: a - Python decorator's name (so `@cache` → `@lru_cache` is caught), and — under the v2 - recipe — a member-access name (the property/field of `obj.foo`/`pkg.Bar`), so - re-pointing at a different external symbol is caught even when it occurs once. The + The canonical token stream drops comments and keeps operators, keywords, and literal + values verbatim. Under v1 every identifier is alpha-renamed to a positional placeholder; + under v2 (the bound/free split) only bound identifiers are alpha-renamed and free + identifiers are emitted verbatim, so a consistent local rename stays quiet while + re-pointing at a different external symbol is loud even when it occurs once. A + member-access name is kept verbatim even when its text collides with a bound local. The per-claim ignore_literals option drops string-literal content so a copy edit doesn't re-open the gate. at: surf-core/src/hash.rs > emit - hash: 2:1a93c8f4b8d9 + hash: 2:ac52f23c70c8 + - claim: > + Under v2 only names bound inside the span are alpha-renamed — the symbol's own name, + parameters, locals, loop/range/comprehension variables, with/catch aliases, generic + params, and destructuring binders. Detection is tree-sitter-only and fail-closed: a + position not positively recognized as a binding defaults to free (verbatim). + at: surf-core/src/hash.rs > collect_bound + hash: 2:20fd6172cf43 + - claim: > + The property/field component of a member-access expression is kept verbatim even when its + text collides with a bound local, since that position can never be the binding — matched + structurally per family (kind + parent kind + the parent's named field). + at: surf-core/src/hash.rs > is_member_access_name + hash: 2:de12739eeb09 - claim: > Identifier node kinds are enumerated per language family; only these are alpha-renamed, everything else (operators, keywords, literals) is kept. at: surf-core/src/hash.rs > is_identifier - hash: 2:ac8c69676a07 + hash: 2:25ca2f219009 - claim: > A claim's hash is the combination of its per-site hashes — a single site is the identity, multiple sites combine order-sensitively, so the claim is stale if any listed span changes. at: surf-core/src/hash.rs > combine_site_hashes - hash: 2:a81ab78387c2 + hash: 2:cbbbbc3b2237 refs: [] --- @@ -33,11 +46,14 @@ gate compares; `Magnitude` alongside it is advisory and never gates. "Canonical" is what makes the gate trustworthy: comments are dropped and identifiers are alpha-renamed to positional placeholders, so a consistent rename or a reflow doesn't trip a claim, -while operators, keywords, and literal values stay verbatim, so a real logic edit does. The -exceptions exist because a name *is* the logic there — a Python decorator, and (v2) a -member-access name — so swapping one is caught even when it occurs once. A claim's hash is the -order-sensitive combination of its per-site hashes, which is what lets one multi-site claim go -stale when any of its spans changes. +while operators, keywords, and literal values stay verbatim, so a real logic edit does. Which +identifiers get alpha-renamed is the recipe's job: v1 renames them all; v2 (the bound/free split, +#77) renames only **bound** names — params, locals, the symbol's own name — and emits every +**free** identifier (external members, call targets, types, constants, decorators) verbatim, so +re-pointing a span at a different symbol is loud even when the name occurs once. A claim's hash is +the order-sensitive combination of its per-site hashes, which is what lets one multi-site claim go +stale when any of its spans changes. See [hash recipes](../docs/reference/hash-recipes.md) for the +versioned canonicalization and migration. **Boundary:** hashing decides *that* something changed, never *whether the prose is still true* — that judgment is the human's at [`surf verify`](./cli-verify.md). diff --git a/hubs/hub-format.md b/hubs/hub-format.md index 719f5d1..7f5f03c 100644 --- a/hubs/hub-format.md +++ b/hubs/hub-format.md @@ -6,12 +6,12 @@ anchors: scalar or a list, hash is optional until verified, and unknown fields are rejected — though forward-declared fields (`refs`, `covers`) are accepted and stored but inert in the verdict. at: surf-core/src/hub.rs > parse_hub - hash: 2:55be573a0ca2 + hash: 2:c510c6032ba7 - claim: > verify writes hashes back surgically: set_anchor_hash locates the Nth anchor item and replaces/inserts only its hash line, so an unchanged hash is byte-identical. at: surf-core/src/hub.rs > set_anchor_hash - hash: 2:a65d5c324dc5 + hash: 2:29805baa85ea refs: [] covers: - surf-core/src/hub.rs diff --git a/hubs/lang.md b/hubs/lang.md index 929308a..107420e 100644 --- a/hubs/lang.md +++ b/hubs/lang.md @@ -5,7 +5,7 @@ anchors: Language is detected purely by file extension (ts/tsx/mts/cts, js/jsx/mjs/cjs, rs, py/pyi, go); an unknown extension yields None and the anchor is treated as unsupported. at: surf-core/src/lang.rs > Lang > from_path - hash: 2:fabba17dc0f9 + hash: 2:e8a03d2b4a80 refs: [] --- diff --git a/hubs/rename.md b/hubs/rename.md index 8569be5..e6c6cc6 100644 --- a/hubs/rename.md +++ b/hubs/rename.md @@ -7,7 +7,7 @@ anchors: alpha-renames identifiers, a renamed-but-unchanged symbol still matches. No git, no similarity threshold. at: surf-core/src/rename.rs > find_renamed - hash: 2:8d4b88480875 + hash: 2:a6612bef104e refs: [] --- diff --git a/hubs/resolve.md b/hubs/resolve.md index 8cb448b..6ca1783 100644 --- a/hubs/resolve.md +++ b/hubs/resolve.md @@ -9,18 +9,18 @@ anchors: same-name stubs plus their implementation, in the same scope) counts as one match, so the bare name resolves without @N and the gated span covers every overload signature. at: surf-core/src/resolve.rs > resolve_nodes - hash: 2:26a42e2bfa92 + hash: 2:228dbc1dac0b - claim: > Go is resolved by a dedicated path: its symbols are flat (no nested declarations) and methods attach to a type by receiver, so `Type > Method` matches a method_declaration whose receiver type equals the type. at: surf-core/src/resolve.rs > resolve_go - hash: 2:07d730bc2bf8 + hash: 2:cba05f7f0725 - claim: > Rename detection enumerates every definition at any depth so a renamed-but-unchanged symbol can be found by hash. at: surf-core/src/resolve.rs > collect_all_defs - hash: 2:22d81a580041 + hash: 2:674b0af051a4 refs: [] --- diff --git a/surf-core/src/hash.rs b/surf-core/src/hash.rs index 42ec97b..a07bc69 100644 --- a/surf-core/src/hash.rs +++ b/surf-core/src/hash.rs @@ -1,41 +1,15 @@ //! AST-canonical hashing (§6.1) and advisory diff magnitude (§6.2). //! -//! The hash is computed over a canonical token stream of the symbol's subtree: -//! - whitespace and formatting are absent from the tree, so they are ignored for free; -//! - comments are dropped explicitly; -//! - identifiers are alpha-renamed to positional placeholders (`#0`, `#1`, …) in order of -//! first occurrence, so a *consistent* rename hashes identically while swapping two names -//! does not; -//! - operators, keywords, punctuation, and literal *values* are kept verbatim — so a -//! flipped operator (`+`→`-`), a relaxed comparison (`<`→`<=`), a deleted `await`, or a -//! changed constant all change the hash. -//! -//! The result is quiet on the changes you want ignored and loud on the ones you must catch. -//! -//! ## Recipes (versioned canonicalization) -//! -//! The canonicalization above is the **v1** recipe. **v2** (#140) adds one rule: the -//! property/field component of a member-access expression (`obj.foo`, `pkg.Bar`) is kept -//! *verbatim* rather than alpha-renamed, so re-pointing an anchored span at a different -//! external symbol (`PointsTier.TIER_1` → `TIER_2`, `b.Del` → `b.Keep`) changes the hash even -//! when the name occurs exactly once. These positions are never bindings, so emitting them -//! verbatim cannot resurface a benign local rename. v1 ≡ v2 minus that single rule — one mode -//! flag, no frozen copy of the old algorithm. -//! -//! Stored stamps carry their recipe: a v2 stamp is prefixed `2:`, a bare 12-hex stamp is -//! implicitly v1. New stamps are written under [`Recipe::CURRENT`]; `check` verifies a stamp -//! under *its own* recipe, so existing v1 stamps keep working until `surf verify` upgrades -//! them. See `docs/hash-recipes.md`. -//! -//! `Magnitude` is advisory triage metadata only. It is never compared, thresholded, or used -//! to decide pass/fail — that would defeat the whole point (§6.2). +//! The design (quiet on cosmetics, loud on logic), the v1/v2 recipes, and the bound/free split +//! live in `hubs/hash.md` and `docs/reference/hash-recipes.md` — anchored to the functions below +//! so they can't silently rot. `Magnitude` is advisory triage only; it never gates (§6.2). use crate::anchor::Anchor; use crate::lang::{Family, Lang}; use crate::resolve::{hashable_node, parse_tree, resolve_nodes, ResolveError}; use serde::Serialize; use sha2::{Digest, Sha256}; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fmt::Write as _; use tree_sitter::Node; @@ -55,7 +29,8 @@ pub struct HashOpts { pub enum Recipe { /// Original recipe: every identifier alpha-renamed. Implicit for bare (unprefixed) stamps. V1, - /// v1 plus the member-access-name verbatim rule (#140). Stamps are prefixed `2:`. + /// The bound/free split (#77): only bound identifiers alpha-renamed, free identifiers + /// (external members, call targets, types, constants) verbatim. Stamps are prefixed `2:`. V2, } @@ -180,16 +155,22 @@ fn anchor_tokens( // A Python @overload group hashes as one token stream — stubs then impl in source order, // sharing one alpha-rename map — so a signature change in *any* overload changes the // hash (#82). The usual single-node case is unchanged. + let hashable: Vec = nodes + .into_iter() + .map(|n| hashable_node(n, family)) + .collect(); + let bound = bound_names(&hashable, family, src, recipe); let mut out = Vec::new(); let mut idents: HashMap = HashMap::new(); - for node in nodes { + for node in hashable { emit( - hashable_node(node, family), + node, src, family, opts, recipe, false, + &bound, &mut idents, &mut out, ); @@ -197,6 +178,18 @@ fn anchor_tokens( Ok(out) } +/// The names bound inside the span — the only identifiers v2 alpha-renames. Empty under v1 +/// (every identifier alpha-renamed regardless). One set is shared across an `@overload` group. +fn bound_names(nodes: &[Node], family: Family, src: &[u8], recipe: Recipe) -> HashSet { + let mut bound = HashSet::new(); + if recipe == Recipe::V2 { + for node in nodes { + collect_bound(*node, family, src, &mut bound); + } + } + bound +} + pub(crate) fn hash_node( node: Node, src: &[u8], @@ -214,15 +207,18 @@ fn canonical_tokens( opts: HashOpts, recipe: Recipe, ) -> Vec { + let node = hashable_node(node, family); + let bound = bound_names(std::slice::from_ref(&node), family, src, recipe); let mut out = Vec::new(); let mut idents: HashMap = HashMap::new(); emit( - hashable_node(node, family), + node, src, family, opts, recipe, false, + &bound, &mut idents, &mut out, ); @@ -236,11 +232,10 @@ fn emit( family: Family, opts: HashOpts, recipe: Recipe, - // True while inside a decorator's *name* (the symbol being applied), where identifiers are - // kept verbatim rather than alpha-renamed — so `@cache` → `@lru_cache` or - // `@staticmethod` → `@classmethod` is caught (§6.1, #8). Arguments to a decorator follow the - // normal rules, so reformatting them stays quiet. + // v1 only: a decorator name kept verbatim (#8). v2 treats it as a free identifier instead. decorator_name: bool, + // v2 only: names bound in the span; an identifier is alpha-renamed iff its text is in here. + bound: &HashSet, idents: &mut HashMap, out: &mut Vec, ) { @@ -252,10 +247,12 @@ fn emit( if node.is_named() { if is_identifier(kind, family) { let text = node.utf8_text(src).unwrap_or_default(); - // v2 keeps member-access names verbatim too, so `obj.foo` → `obj.bar` is loud even - // when `bar` occurs once (#140). v1 keeps only decorator names verbatim. - let verbatim = - decorator_name || (recipe == Recipe::V2 && is_member_access_name(node, family)); + // A member-access name is verbatim even when it collides with a bound local + // (`x` the param vs `obj.x` the field) — that position can never be the binding. + let verbatim = match recipe { + Recipe::V1 => decorator_name, + Recipe::V2 => is_member_access_name(node, family) || !bound.contains(text), + }; if verbatim { out.push(format!("{kind}:{text}")); } else { @@ -308,19 +305,162 @@ fn emit( opts, recipe, child_decorator_name, + bound, idents, out, ); } } -/// True for the property/field component of a member-access expression — the part the v2 -/// recipe (#140) keeps verbatim. These positions name an *external* member, never a local -/// binding, so emitting them verbatim distinguishes "re-pointed at a different symbol" from -/// "renamed my own local" without breaking rename tolerance. Each family is matched -/// structurally (kind + parent kind + the parent's named field) so an identifier that merely -/// *shares* the kind in another position (e.g. an object-literal key, a method *name*) is left -/// to the normal alpha-rename. +/// Walk the span collecting every bound name (see `hubs/hash.md`). Fail-closed: a position not +/// positively recognized as a binding by [`bind_here`] is left free. +fn collect_bound(node: Node, family: Family, src: &[u8], out: &mut HashSet) { + bind_here(node, family, src, out); + let mut cursor = node.walk(); + for child in node.children(&mut cursor) { + collect_bound(child, family, src, out); + } +} + +/// The per-family binding-position table: declaration names bind directly, pattern positions +/// via [`harvest`]. +fn bind_here(node: Node, family: Family, src: &[u8], out: &mut HashSet) { + let kind = node.kind(); + match family { + Family::Rust => match kind { + "function_item" | "function_signature_item" => bind_field_text(node, "name", src, out), + "parameter" | "let_declaration" | "for_expression" | "let_condition" => { + harvest_field(node, "pattern", src, out) + } + "closure_parameters" | "type_parameters" => harvest_children(node, src, out), + _ => {} + }, + Family::TypeScript => match kind { + "function_declaration" + | "generator_function_declaration" + | "function_signature" + | "method_definition" + | "method_signature" + | "abstract_method_signature" => bind_field_text(node, "name", src, out), + "required_parameter" | "optional_parameter" => harvest_field(node, "pattern", src, out), + "variable_declarator" => harvest_field(node, "name", src, out), + "arrow_function" => harvest_field(node, "parameter", src, out), + "for_in_statement" => harvest_field(node, "left", src, out), + "catch_clause" => harvest_field(node, "parameter", src, out), + "type_parameters" => harvest_children(node, src, out), + _ => {} + }, + Family::Python => match kind { + "function_definition" => bind_field_text(node, "name", src, out), + // A default's *value* is a free expression, so bind only the name; every other + // parameter form (plain, typed, `*args`, `**kw`) harvests cleanly. + "parameters" | "lambda_parameters" => { + let mut cursor = node.walk(); + for child in node.named_children(&mut cursor) { + match child.kind() { + "default_parameter" | "typed_default_parameter" => { + bind_field_text(child, "name", src, out) + } + _ => harvest(child, src, out), + } + } + } + "assignment" | "augmented_assignment" | "for_statement" | "for_in_clause" => { + harvest_field(node, "left", src, out) + } + "as_pattern_target" => harvest(node, src, out), + _ => {} + }, + Family::Go => match kind { + "function_declaration" + | "method_declaration" + | "var_spec" + | "const_spec" + | "type_parameter_declaration" => bind_field_text(node, "name", src, out), + "parameter_declaration" | "variadic_parameter_declaration" => { + bind_field_text(node, "name", src, out) + } + "short_var_declaration" | "range_clause" => harvest_field(node, "left", src, out), + _ => {} + }, + } +} + +/// Bind the text of every `field` child directly — bypassing [`harvest`]'s leaf filter so a +/// method name counts but a destructuring key does not. All matching fields, so `var a, b` binds +/// both. +fn bind_field_text(node: Node, field: &str, src: &[u8], out: &mut HashSet) { + let mut cursor = node.walk(); + if cursor.goto_first_child() { + loop { + if cursor.field_name() == Some(field) { + if let Ok(text) = cursor.node().utf8_text(src) { + out.insert(text.to_string()); + } + } + if !cursor.goto_next_sibling() { + break; + } + } + } +} + +/// [`harvest`] every `field` child (a pattern position). +fn harvest_field(node: Node, field: &str, src: &[u8], out: &mut HashSet) { + let mut cursor = node.walk(); + if cursor.goto_first_child() { + loop { + if cursor.field_name() == Some(field) { + harvest(cursor.node(), src, out); + } + if !cursor.goto_next_sibling() { + break; + } + } + } +} + +fn harvest_children(node: Node, src: &[u8], out: &mut HashSet) { + let mut cursor = node.walk(); + for child in node.named_children(&mut cursor) { + harvest(child, src, out); + } +} + +/// Collect binding-leaf identifiers from a pattern subtree, skipping path/member positions and +/// `type:` fields (external names, never local bindings). A destructure *key* is not a leaf kind, +/// so re-pointing it at a different source member stays loud. +fn harvest(node: Node, src: &[u8], out: &mut HashSet) { + match node.kind() { + "scoped_identifier" + | "scoped_type_identifier" + | "attribute" + | "member_expression" + | "selector_expression" + | "field_expression" => return, + "identifier" + | "type_identifier" + | "shorthand_field_identifier" + | "shorthand_property_identifier_pattern" => { + if let Ok(text) = node.utf8_text(src) { + out.insert(text.to_string()); + } + return; + } + _ => {} + } + let type_field = node.child_by_field_name("type").map(|n| n.id()); + let mut cursor = node.walk(); + for child in node.children(&mut cursor) { + if Some(child.id()) != type_field { + harvest(child, src, out); + } + } +} + +/// The property/field component of a member access (see `hubs/hash.md`). Matched structurally +/// (kind + parent kind + named field) so the same kind elsewhere (an object key, a method name) +/// isn't caught. fn is_member_access_name(node: Node, family: Family) -> bool { let Some(parent) = node.parent() else { return false; @@ -572,17 +712,91 @@ mod tests { ); } - /// An object-literal *key* is a `property_identifier` too, but not a member access — it stays - /// alpha-renamed, so renaming both a key and its sole reference consistently is quiet (the - /// structural check guards against over-firing on non-access `property_identifier`s). + /// An object-literal *key* is free (not a binding), so under the bound/free split renaming it + /// is loud — it changes the shape of the constructed object, the same class of change as a + /// member rename. v1, which alpha-renames every identifier, is blind to it. #[test] - fn object_literal_key_is_not_treated_as_member_access() { + fn object_literal_key_rename_is_v1_blind_and_v2_loud() { let a = "export function f() { const o = { alpha: 1 }; return o; }\n"; let b = "export function f() { const o = { beta: 1 }; return o; }\n"; - // Both v1 and v2 see a single identifier in that position → alpha-renamed → equal. assert_eq!( + raw(a, Lang::TypeScript, "x.ts > f", Recipe::V1), + raw(b, Lang::TypeScript, "x.ts > f", Recipe::V1), + ); + assert_ne!( raw(a, Lang::TypeScript, "x.ts > f", Recipe::V2), raw(b, Lang::TypeScript, "x.ts > f", Recipe::V2), ); } + + /// Swapping a bare single-occurrence free call target (no receiver, so not a member access) is + /// invisible to v1 but loud under the full split. + #[test] + fn bare_free_call_target_swap_is_v1_blind_and_v2_loud() { + let a = "pub fn f(x: i64) -> i64 { helper(x) }\n"; + let b = "pub fn f(x: i64) -> i64 { other(x) }\n"; + assert_eq!( + raw(a, Lang::Rust, "x.rs > f", Recipe::V1), + raw(b, Lang::Rust, "x.rs > f", Recipe::V1), + "v1 alpha-renames the call target → blind", + ); + assert_ne!( + raw(a, Lang::Rust, "x.rs > f", Recipe::V2), + raw(b, Lang::Rust, "x.rs > f", Recipe::V2), + "v2 emits the free call target verbatim → loud", + ); + } + + /// A free type reference is verbatim under v2: changing a parameter's type is a contract + /// change, caught even when the type name occurs once. A *generic* parameter, declared in the + /// span, stays bound — renaming it consistently is quiet. + #[test] + fn free_type_is_loud_generic_param_is_quiet() { + let a = "pub fn f(x: Foo) -> i64 { 0 }\n"; + let b = "pub fn f(x: Bar) -> i64 { 0 }\n"; + assert_ne!( + raw(a, Lang::Rust, "x.rs > f", Recipe::V2), + raw(b, Lang::Rust, "x.rs > f", Recipe::V2), + "swapping an external type is loud", + ); + let g1 = "pub fn f(x: T) -> T { x }\n"; + let g2 = "pub fn f(x: U) -> U { x }\n"; + assert_eq!( + raw(g1, Lang::Rust, "x.rs > f", Recipe::V2), + raw(g2, Lang::Rust, "x.rs > f", Recipe::V2), + "renaming a generic parameter consistently is quiet", + ); + } + + /// Destructuring binders are bound (renaming them is quiet), but the *source key* they read + /// from is free (re-pointing at a different member is loud) — across the pattern forms each + /// family offers. + #[test] + fn destructuring_binder_quiet_source_key_loud() { + // TS object destructuring: rename the binder `b` → quiet; change the source key → loud. + let bind_a = "export function f(o: O) { const { k: b } = o; return b; }\n"; + let bind_b = "export function f(o: O) { const { k: c } = o; return c; }\n"; + assert_eq!( + raw(bind_a, Lang::TypeScript, "x.ts > f", Recipe::V2), + raw(bind_b, Lang::TypeScript, "x.ts > f", Recipe::V2), + ); + let key_a = "export function f(o: O) { const { k: b } = o; return b; }\n"; + let key_b = "export function f(o: O) { const { j: b } = o; return b; }\n"; + assert_ne!( + raw(key_a, Lang::TypeScript, "x.ts > f", Recipe::V2), + raw(key_b, Lang::TypeScript, "x.ts > f", Recipe::V2), + ); + } + + /// A Python decorator name is a free identifier under v2, so `@cache` → `@lru_cache` is loud + /// without the dedicated decorator special case v1 needed (#8 is subsumed by the split). + #[test] + fn decorator_name_swap_is_loud_under_v2_without_special_case() { + let a = "@cache\ndef f(x):\n return x\n"; + let b = "@lru_cache\ndef f(x):\n return x\n"; + assert_ne!( + raw(a, Lang::Python, "x.py > f", Recipe::V2), + raw(b, Lang::Python, "x.py > f", Recipe::V2), + ); + } } diff --git a/surf-core/tests/differential_hash.rs b/surf-core/tests/differential_hash.rs new file mode 100644 index 0000000..4f58b92 --- /dev/null +++ b/surf-core/tests/differential_hash.rs @@ -0,0 +1,353 @@ +//! In-tree v1-vs-v2 A/B for the bound/free split (#77), kept here so any future canonicalization +//! change reruns the same gate. Two mutation classes, asserted per case below: +//! - Benign (consistent bound-name rename): v2 must stay quiet — zero regressions tolerated. +//! - Semantic (single-occurrence free-name swap): v1 blind, v2 loud — v2 catches what v1 missed. +//! +//! The external git-history replay over real corpora (#77) runs out-of-tree; this is its always-on +//! counterpart. + +use surf_core::{hash_anchor_raw, parse_anchor, HashOpts, Lang, Recipe}; + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +enum Kind { + Benign, + Semantic, +} +use Kind::*; + +struct Case { + lang: Lang, + anchor: &'static str, + // The anchor for `after`, when a benign edit renames the symbol itself (so it resolves under + // a new path). `None` means same as `anchor`. + after_anchor: Option<&'static str>, + before: &'static str, + after: &'static str, + kind: Kind, + note: &'static str, +} + +fn raw(src: &str, lang: Lang, anchor: &str, recipe: Recipe) -> String { + hash_anchor_raw( + src, + lang, + &parse_anchor(anchor).unwrap(), + HashOpts::default(), + recipe, + ) + .unwrap() +} + +fn cases() -> Vec { + vec![ + // ---- Benign: consistent renames of BOUND names must stay quiet under v2 ------------- + Case { + lang: Lang::Rust, + anchor: "x.rs > f", + after_anchor: None, + before: "pub fn f(nxp: i64) -> i64 { let t = nxp; t + nxp }\n", + after: "pub fn f(points: i64) -> i64 { let t2 = points; t2 + points }\n", + kind: Benign, + note: "rust param + local rename", + }, + Case { + lang: Lang::Rust, + anchor: "x.rs > f", + after_anchor: None, + before: "pub fn f(xs: Vec) -> T { xs.into_iter().next().unwrap() }\n", + after: "pub fn f(ys: Vec) -> U { ys.into_iter().next().unwrap() }\n", + kind: Benign, + note: "rust generic param + param rename", + }, + Case { + lang: Lang::Rust, + anchor: "x.rs > f", + after_anchor: None, + before: "pub fn f(v: Vec) -> i64 { let mut s = 0; for it in v { s += it; } s }\n", + after: "pub fn f(w: Vec) -> i64 { let mut q = 0; for jt in w { q += jt; } q }\n", + kind: Benign, + note: "rust for-loop binder + locals", + }, + Case { + lang: Lang::Rust, + anchor: "x.rs > rotate", + after_anchor: Some("x.rs > renamed"), + before: "pub fn rotate(token: &str) -> String { token.to_string() }\n", + after: "pub fn renamed(token: &str) -> String { token.to_string() }\n", + kind: Benign, + note: "rust symbol's own name (rename-quiet, relocatable by hash)", + }, + Case { + lang: Lang::TypeScript, + anchor: "x.ts > S > m", + after_anchor: None, + before: "export class S {\n m(a: number): number { const x = a; return x + a; }\n}\n", + after: "export class S {\n m(b: number): number { const y = b; return y + b; }\n}\n", + kind: Benign, + note: "ts param + local rename", + }, + Case { + lang: Lang::TypeScript, + anchor: "x.ts > f", + after_anchor: None, + before: "export function f(o: O): number { const { k: a } = o; return a; }\n", + after: "export function f(o: O): number { const { k: b } = o; return b; }\n", + kind: Benign, + note: "ts destructuring binder rename (source key unchanged)", + }, + Case { + lang: Lang::TypeScript, + anchor: "x.ts > f", + after_anchor: None, + before: "export function f(xs: number[]): number { return xs.map((v) => v + 1)[0]; }\n", + after: "export function f(ys: number[]): number { return ys.map((w) => w + 1)[0]; }\n", + kind: Benign, + note: "ts arrow-fn param + param rename", + }, + Case { + lang: Lang::Python, + anchor: "x.py > f", + after_anchor: None, + before: "def f(a, b=1):\n x = a + b\n return x\n", + after: "def f(c, b=1):\n y = c + b\n return y\n", + kind: Benign, + note: "py param + local rename (default value untouched)", + }, + Case { + lang: Lang::Python, + anchor: "x.py > f", + after_anchor: None, + before: "def f(items):\n total = 0\n for it in items:\n total += it\n return total\n", + after: "def f(things):\n sum_ = 0\n for jt in things:\n sum_ += jt\n return sum_\n", + kind: Benign, + note: "py for-loop binder + augmented-assignment local", + }, + Case { + lang: Lang::Python, + anchor: "x.py > f", + after_anchor: None, + before: "def f(p):\n with open(p) as fh:\n return fh.read()\n", + after: "def f(q):\n with open(q) as handle:\n return handle.read()\n", + kind: Benign, + note: "py with-as alias + param rename", + }, + Case { + lang: Lang::Go, + anchor: "x.go > Builder > Set", + after_anchor: None, + before: "func (b *Builder) Set(n string) string { x := n; return x }\n", + after: "func (c *Builder) Set(m string) string { y := m; return y }\n", + kind: Benign, + note: "go receiver + param + short-var rename", + }, + Case { + lang: Lang::Go, + anchor: "x.go > Sum", + after_anchor: None, + before: "func Sum(xs []int) int {\n\ttotal := 0\n\tfor i, v := range xs {\n\t\ttotal += i + v\n\t}\n\treturn total\n}\n", + after: "func Sum(ys []int) int {\n\tsum := 0\n\tfor j, w := range ys {\n\t\tsum += j + w\n\t}\n\treturn sum\n}\n", + kind: Benign, + note: "go range binders + locals rename", + }, + Case { + lang: Lang::Go, + anchor: "x.go > Pair", + after_anchor: None, + before: "func Pair(a, b int) int { return a + b }\n", + after: "func Pair(c, d int) int { return c + d }\n", + kind: Benign, + note: "go grouped multi-name params rename", + }, + // ---- Semantic: single-occurrence FREE swaps must become loud under v2 --------------- + Case { + lang: Lang::TypeScript, + anchor: "x.ts > S > f", + after_anchor: None, + before: "export class S {\n f(): T { return PointsTier.TIER_1; }\n}\n", + after: "export class S {\n f(): T { return PointsTier.TIER_2; }\n}\n", + kind: Semantic, + note: "ts enum member swap (the original #77 repro)", + }, + Case { + lang: Lang::TypeScript, + anchor: "x.ts > S > f", + after_anchor: None, + before: "export class S {\n f(u: U): T { return Tiers.getHighest(u); }\n}\n", + after: "export class S {\n f(u: U): T { return Tiers.getLowest(u); }\n}\n", + kind: Semantic, + note: "ts method-call target swap", + }, + Case { + lang: Lang::TypeScript, + anchor: "x.ts > f", + after_anchor: None, + before: "export function f(x: number): number { return helper(x); }\n", + after: "export function f(x: number): number { return other(x); }\n", + kind: Semantic, + note: "ts bare free call-target swap", + }, + Case { + lang: Lang::TypeScript, + anchor: "x.ts > f", + after_anchor: None, + before: "export function f(o: O): number { const { k: a } = o; return a; }\n", + after: "export function f(o: O): number { const { j: a } = o; return a; }\n", + kind: Semantic, + note: "ts destructuring SOURCE key swap (reads a different member)", + }, + Case { + lang: Lang::Go, + anchor: "x.go > Builder > Set", + after_anchor: None, + before: "func (b *Builder) Set(n string) *Builder { return b.Del(n) }\n", + after: "func (b *Builder) Set(n string) *Builder { return b.Keep(n) }\n", + kind: Semantic, + note: "go field-method swap", + }, + Case { + lang: Lang::Go, + anchor: "x.go > F", + after_anchor: None, + before: "func F(x int) int { return helper(x) }\n", + after: "func F(x int) int { return other(x) }\n", + kind: Semantic, + note: "go bare free call-target swap", + }, + Case { + lang: Lang::Python, + anchor: "x.py > color", + after_anchor: None, + before: "def color(self):\n return ProbeColor.RED\n", + after: "def color(self):\n return ProbeColor.GREEN\n", + kind: Semantic, + note: "py attribute swap", + }, + // NB: a decorator-name swap is intentionally *not* in this table — v1 already catches it + // via the #8 special case, so it is not v1-blind. That v2 catches it through the general + // free-identifier rule (the special case subsumed) is asserted in hash.rs unit tests. + Case { + lang: Lang::Python, + anchor: "x.py > f", + after_anchor: None, + before: "def f(x):\n return helper(x)\n", + after: "def f(x):\n return other(x)\n", + kind: Semantic, + note: "py bare free call-target swap", + }, + Case { + lang: Lang::Rust, + anchor: "x.rs > f", + after_anchor: None, + before: "pub fn f(p: P) -> i64 { p.first }\n", + after: "pub fn f(p: P) -> i64 { p.second }\n", + kind: Semantic, + note: "rust field-access swap", + }, + Case { + lang: Lang::Rust, + anchor: "x.rs > f", + after_anchor: None, + before: "pub fn f(x: i64) -> i64 { helper(x) }\n", + after: "pub fn f(x: i64) -> i64 { other(x) }\n", + kind: Semantic, + note: "rust bare free call-target swap", + }, + Case { + lang: Lang::Rust, + anchor: "x.rs > f", + after_anchor: None, + before: "pub fn f(x: Foo) -> i64 { 0 }\n", + after: "pub fn f(x: Bar) -> i64 { 0 }\n", + kind: Semantic, + note: "rust external param-type swap (contract change)", + }, + Case { + lang: Lang::Go, + anchor: "x.go > F", + after_anchor: None, + before: "func F(x Foo) int { return 0 }\n", + after: "func F(x Bar) int { return 0 }\n", + kind: Semantic, + note: "go external param-type swap (contract change)", + }, + ] +} + +#[test] +fn v2_is_quiet_on_benign_renames_and_loud_on_free_swaps() { + let mut benign = 0usize; + let mut semantic = 0usize; + for c in cases() { + let after_anchor = c.after_anchor.unwrap_or(c.anchor); + let v1_before = raw(c.before, c.lang, c.anchor, Recipe::V1); + let v1_after = raw(c.after, c.lang, after_anchor, Recipe::V1); + let v2_before = raw(c.before, c.lang, c.anchor, Recipe::V2); + let v2_after = raw(c.after, c.lang, after_anchor, Recipe::V2); + match c.kind { + Benign => { + // Rename tolerance: a consistent bound-name rename must be invisible to v2. + assert_eq!( + v2_before, v2_after, + "BENIGN REGRESSION: v2 fired on a consistent bound rename [{}]", + c.note + ); + benign += 1; + } + Semantic => { + // Each is single-occurrence free, so v1 is blind — that is the bug. + assert_eq!( + v1_before, v1_after, + "harness setup: semantic case is not v1-blind [{}]", + c.note + ); + // v2 must catch what v1 missed. + assert_ne!( + v2_before, v2_after, + "MISSED DRIFT: v2 stayed quiet on a free-identifier swap [{}]", + c.note + ); + semantic += 1; + } + } + } + // The headline metrics from the issue's validation gate, made executable. + assert!(benign >= 12, "expected a broad benign corpus, got {benign}"); + assert!( + semantic >= 12, + "expected a broad semantic corpus, got {semantic}" + ); + eprintln!( + "differential: {benign} benign (0 regressions), {semantic} semantic (100% v2 catch, 0% v1)" + ); +} + +/// The accepted approximation, pinned so it's a documented limit not a surprise (see +/// `docs/reference/hash-recipes.md`). Match-arm pattern identifiers are left free: a unit-variant +/// swap is caught (good), but renaming a catch-all binding is also loud (an accepted false +/// positive). +#[test] +fn accepted_residue_match_arm_identifiers_are_free() { + // Safe direction: matching a different unit variant is a real change and is caught. + let variant_a = "pub fn f(c: Color) -> i64 { match c { Color::Red => 1, _ => 0 } }\n"; + let variant_b = "pub fn f(c: Color) -> i64 { match c { Color::Blue => 1, _ => 0 } }\n"; + assert_ne!( + raw(variant_a, Lang::Rust, "x.rs > f", Recipe::V2), + raw(variant_b, Lang::Rust, "x.rs > f", Recipe::V2), + "a unit-variant swap must stay loud", + ); + + // Accepted cost: renaming a catch-all binding is loud under v2 (it is left free), where v1 was + // quiet. This is the one benign-edit class v2 does not keep silent — documented, not a bug. + let bind_a = "pub fn f(o: Option) -> i64 { match o { Some(x) => x, None => 0 } }\n"; + let bind_b = "pub fn f(o: Option) -> i64 { match o { Some(y) => y, None => 0 } }\n"; + assert_eq!( + raw(bind_a, Lang::Rust, "x.rs > f", Recipe::V1), + raw(bind_b, Lang::Rust, "x.rs > f", Recipe::V1), + "v1 is quiet on the match-binding rename", + ); + assert_ne!( + raw(bind_a, Lang::Rust, "x.rs > f", Recipe::V2), + raw(bind_b, Lang::Rust, "x.rs > f", Recipe::V2), + "residue changed: revisit the documented limit in docs/reference/hash-recipes.md", + ); +} diff --git a/surf-core/tests/golden_hash.rs b/surf-core/tests/golden_hash.rs index 77c4fa6..fbe6fac 100644 --- a/surf-core/tests/golden_hash.rs +++ b/surf-core/tests/golden_hash.rs @@ -40,10 +40,11 @@ fn raw(src: &str, lang: Lang, anchor: &str, recipe: Recipe) -> String { #[test] fn golden_hashes_are_stable_per_language() { // Each snippet carries a comment and non-canonical whitespace on purpose, so the golden - // already encodes the "comments + formatting are ignored" guarantee. These snippets are - // member-access-free, so v1 and v2 agree byte-for-byte — itself the guarantee that v2 only - // diverges on member-access names (#140). The frozen v1 digests are unchanged from before - // versioning, so existing v1 stamps in downstream repos still verify. + // already encodes the "comments + formatting are ignored" guarantee. The frozen v1 digests + // are unchanged from before versioning, so existing v1 stamps in downstream repos still + // verify. A snippet whose only identifiers are *bound* (params, the symbol's own name) and + // operators hashes the same under v1 and v2 — the bound/free split (#77) changes nothing + // when there is no free identifier to emit verbatim. let rust = "pub fn add(a: i64, b: i64) -> i64 {\n // sum them\n a + b\n}\n"; assert_eq!( raw(rust, Lang::Rust, "x.rs > add", Recipe::V1), @@ -65,16 +66,6 @@ fn golden_hashes_are_stable_per_language() { "afa4514b5c89" ); - let tsx = "export function App(): JSX.Element {\n return
{1 + 2}
;\n}\n"; - assert_eq!( - raw(tsx, Lang::Tsx, "x.tsx > App", Recipe::V1), - "97e0de58725d" - ); - assert_eq!( - raw(tsx, Lang::Tsx, "x.tsx > App", Recipe::V2), - "97e0de58725d" - ); - let py = "def add(a, b):\n # comment\n return a + b\n"; assert_eq!( raw(py, Lang::Python, "x.py > add", Recipe::V1), @@ -85,9 +76,22 @@ fn golden_hashes_are_stable_per_language() { "879b76118966" ); + // These two carry *free* identifiers — JSX element/type names, and Go's `int` type — so the + // bound/free split makes v2 diverge from v1: the free names are now verbatim, so re-pointing + // at a different tag or type is loud. Both digests are pinned. + let tsx = "export function App(): JSX.Element {\n return
{1 + 2}
;\n}\n"; + assert_eq!( + raw(tsx, Lang::Tsx, "x.tsx > App", Recipe::V1), + "97e0de58725d" + ); + assert_eq!( + raw(tsx, Lang::Tsx, "x.tsx > App", Recipe::V2), + "92e69aab47fb" + ); + let go = "func Add(a int, b int) int {\n\t// sum\n\treturn a + b\n}\n"; assert_eq!(raw(go, Lang::Go, "x.go > Add", Recipe::V1), "942af2641116"); - assert_eq!(raw(go, Lang::Go, "x.go > Add", Recipe::V2), "942af2641116"); + assert_eq!(raw(go, Lang::Go, "x.go > Add", Recipe::V2), "5bb84c760e6b"); // The stored stamp for a single-site anchor carries the current-recipe (v2) prefix. assert_eq!(h(rust, Lang::Rust, "x.rs > add"), "2:f1075e760a17"); @@ -99,8 +103,9 @@ fn golden_unicode_identifier_hashes_are_stable() { // Non-ASCII symbol names and bodies across the four families (#45). Pinning these as goldens // turns any future locale/encoding sensitivity in canonicalization into a loud diff. Each // snippet carries a comment + non-canonical whitespace, so it also re-asserts the - // "comments + formatting ignored" guarantee for Unicode source. All are member-access-free, - // so v1 and v2 agree byte-for-byte. + // "comments + formatting ignored" guarantee for Unicode source. The Rust/TS/Python snippets + // have only bound identifiers, so v1 and v2 agree; the Go one carries the free `int` type, so + // v2 diverges under the bound/free split. let rust = "pub fn café(δ: i64) -> i64 {\n // accent\n δ\n}\n"; assert_eq!( raw(rust, Lang::Rust, "x.rs > café", Recipe::V1), @@ -133,7 +138,7 @@ fn golden_unicode_identifier_hashes_are_stable() { let go = "func Café(δ int) int {\n\t// u\n\treturn δ\n}\n"; assert_eq!(raw(go, Lang::Go, "x.go > Café", Recipe::V1), "9a101a4d062f"); - assert_eq!(raw(go, Lang::Go, "x.go > Café", Recipe::V2), "9a101a4d062f"); + assert_eq!(raw(go, Lang::Go, "x.go > Café", Recipe::V2), "51c5edab6591"); } #[test] @@ -149,9 +154,12 @@ fn unicode_identifier_hashes_are_recomputation_stable() { #[test] fn golden_member_access_hashes_differ_by_recipe() { - // Symbols whose only interesting content is a member access: v1 and v2 diverge, and both - // digests are pinned so a grammar bump or canonicalization refactor that perturbs either - // recipe is a loud, intentional signal (the #140 probes, one per family). + // Symbols carrying member accesses and free references: v1 (alpha-rename everything) and v2 + // (the bound/free split) diverge, and both digests are pinned so a grammar bump or + // canonicalization refactor that perturbs either recipe is a loud, intentional signal (the + // #77/#140 probes, one per family). v2 here also emits the free *receivers*/types verbatim + // (`Tiers`, `PointsTier`, `User`, `Tier`, `Person`), not only the member names — the + // difference between the full split and the member-only first cut. let ts = "export class S {\n tier(u: User): Tier {\n return Tiers.getHighest(u.nxp, PointsTier.TIER_1);\n }\n}\n"; assert_eq!( raw(ts, Lang::TypeScript, "x.ts > S > tier", Recipe::V1), @@ -159,7 +167,7 @@ fn golden_member_access_hashes_differ_by_recipe() { ); assert_eq!( raw(ts, Lang::TypeScript, "x.ts > S > tier", Recipe::V2), - "2c5e43fc3a1f" + "96e55763b827" ); let go = "func (b *Builder) Set(n string) *Builder {\n\treturn b.Del(n)\n}\n"; @@ -169,7 +177,7 @@ fn golden_member_access_hashes_differ_by_recipe() { ); assert_eq!( raw(go, Lang::Go, "x.go > Builder > Set", Recipe::V2), - "e5bc7182a348" + "e6ab4bb83933" ); let py = "def color(self):\n return ProbeColor.RED\n"; @@ -179,7 +187,7 @@ fn golden_member_access_hashes_differ_by_recipe() { ); assert_eq!( raw(py, Lang::Python, "x.py > color", Recipe::V2), - "ccf224e8a6fc" + "0a9441f535d0" ); let rs = "pub fn name(p: Person) -> String {\n p.first.clone()\n}\n"; @@ -189,7 +197,7 @@ fn golden_member_access_hashes_differ_by_recipe() { ); assert_eq!( raw(rs, Lang::Rust, "x.rs > name", Recipe::V2), - "a0a671650fb6" + "31f85c893b1e" ); }