From 962399b053721cbdd10a563141a053142367c388 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 3 Jul 2026 00:10:21 +0800
Subject: [PATCH 01/38] feat(dpmodel): segment_max + numerically-stable
 mask-aware segment_softmax

Built on the existing xp_maximum_at (no new array_api helper needed).
Part of NeighborGraph PR-D (graph-native attention).
---
 .../dpmodel/utils/neighbor_graph/__init__.py  |  4 +
 .../dpmodel/utils/neighbor_graph/segment.py   | 50 ++++++++++
 .../common/dpmodel/test_segment_softmax.py    | 94 +++++++++++++++++++
 3 files changed, 148 insertions(+)
 create mode 100644 source/tests/common/dpmodel/test_segment_softmax.py

diff --git a/deepmd/dpmodel/utils/neighbor_graph/__init__.py b/deepmd/dpmodel/utils/neighbor_graph/__init__.py
index 6e041805b2..19432698cd 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/__init__.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/__init__.py
@@ -33,7 +33,9 @@
     pad_and_guard_edges,
 )
 from .segment import (
+    segment_max,
     segment_mean,
+    segment_softmax,
     segment_sum,
 )
 
@@ -49,6 +51,8 @@
     "neighbor_graph_from_ijs",
     "node_validity_mask",
     "pad_and_guard_edges",
+    "segment_max",
     "segment_mean",
+    "segment_softmax",
     "segment_sum",
 ]
diff --git a/deepmd/dpmodel/utils/neighbor_graph/segment.py b/deepmd/dpmodel/utils/neighbor_graph/segment.py
index 45d64af08c..6f6d946f77 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/segment.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/segment.py
@@ -9,6 +9,7 @@
 from deepmd.dpmodel.array_api import (
     Array,
     xp_add_at,
+    xp_maximum_at,
 )
 
 
@@ -37,3 +38,52 @@ def segment_mean(data: Array, segment_ids: Array, num_segments: int) -> Array:
     # broadcast counts over the trailing dims of summed
     shape = (num_segments,) + (1,) * (summed.ndim - 1)
     return summed / xp.reshape(safe, shape)
+
+
+def segment_max(data: Array, segment_ids: Array, num_segments: int) -> Array:
+    """out[s] = max of data[i] over i with segment_ids[i] == s.
+
+    Shape ``(num_segments, *data.shape[1:])``; empty segments are ``-inf``
+    (neutral element — callers guard with masks before consuming them).
+    """
+    xp = array_api_compat.array_namespace(data)
+    out = xp.full(
+        (num_segments, *tuple(data.shape[1:])),
+        -xp.inf,
+        dtype=data.dtype,
+        device=array_api_compat.device(data),
+    )
+    return xp_maximum_at(out, segment_ids, data)
+
+
+def segment_softmax(
+    data: Array,
+    segment_ids: Array,
+    num_segments: int,
+    mask: Array | None = None,
+) -> Array:
+    """Softmax over entries sharing a segment id, numerically stable.
+
+    Mirrors the dense ``np_softmax`` max-subtraction trick with a PER-SEGMENT
+    max. ``mask`` (bool, per entry) removes masked entries from the softmax
+    entirely (zero weight AND excluded from the denominator). Empty or
+    fully-masked segments produce all-zero weights (no NaN).
+    """
+    xp = array_api_compat.array_namespace(data)
+    if mask is not None:
+        # send masked entries to -inf: out of the per-segment max AND exp -> 0
+        neg = xp.full_like(data, -xp.inf)
+        logits = xp.where(mask, data, neg)
+    else:
+        logits = data
+    seg_max = segment_max(logits, segment_ids, num_segments)
+    # guard -inf (empty / fully-masked segments) so gather doesn't yield inf-inf
+    seg_max = xp.where(xp.isinf(seg_max), xp.zeros_like(seg_max), seg_max)
+    # shift the MASKED logits, not raw data: a masked logit far above the
+    # unmasked max would overflow exp() to inf, and inf * 0 (mask) is NaN.
+    # exp(-inf) is an exact 0, so masked entries need no extra multiply.
+    ex = xp.exp(logits - xp.take(seg_max, segment_ids, axis=0))
+    denom = segment_sum(ex, segment_ids, num_segments)
+    denom_e = xp.take(denom, segment_ids, axis=0)
+    safe = xp.where(denom_e > 0, denom_e, xp.ones_like(denom_e))
+    return ex / safe
diff --git a/source/tests/common/dpmodel/test_segment_softmax.py b/source/tests/common/dpmodel/test_segment_softmax.py
new file mode 100644
index 0000000000..b34ee8efaf
--- /dev/null
+++ b/source/tests/common/dpmodel/test_segment_softmax.py
@@ -0,0 +1,94 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""segment_max / segment_softmax (NeighborGraph PR-D segment toolkit)."""
+
+import numpy as np
+
+from deepmd.dpmodel.utils.neighbor_graph import (
+    segment_max,
+    segment_softmax,
+)
+
+
+class TestSegmentMax:
+    def test_basic(self) -> None:
+        data = np.array([1.0, 5.0, 2.0, -3.0])
+        ids = np.array([0, 0, 2, 2], dtype=np.int64)
+        out = segment_max(data, ids, 3)
+        assert out[0] == 5.0
+        assert np.isneginf(out[1])  # empty segment
+        assert out[2] == 2.0
+
+    def test_trailing_dims(self) -> None:
+        data = np.array([[1.0, -2.0], [3.0, -4.0], [0.0, 9.0]])
+        ids = np.array([1, 1, 0], dtype=np.int64)
+        out = segment_max(data, ids, 2)
+        np.testing.assert_allclose(out[0], [0.0, 9.0])
+        np.testing.assert_allclose(out[1], [3.0, -2.0])
+
+    def test_torch_matches_numpy(self) -> None:
+        import torch
+
+        data = np.array([0.3, 1.2, -0.7, 2.0])
+        ids = np.array([0, 0, 1, 1], dtype=np.int64)
+        ref = segment_max(data, ids, 2)
+        out = segment_max(torch.from_numpy(data), torch.from_numpy(ids), 2)
+        np.testing.assert_allclose(out.numpy(), ref)
+
+
+class TestSegmentSoftmax:
+    def test_matches_dense(self) -> None:
+        logits = np.array([1.0, 2.0, 0.5, -1.0])
+        ids = np.array([0, 0, 0, 1], dtype=np.int64)
+        w = segment_softmax(logits, ids, 2)
+        ref0 = np.exp(np.array([1.0, 2.0, 0.5]) - 2.0)
+        ref0 = ref0 / ref0.sum()
+        np.testing.assert_allclose(w[:3], ref0, atol=1e-12)
+        np.testing.assert_allclose(w[3], 1.0, atol=1e-12)
+
+    def test_stable_large_logits(self) -> None:
+        logits = np.array([1e30, 1e30 + 1.0])
+        ids = np.array([0, 0], dtype=np.int64)
+        w = segment_softmax(logits, ids, 1)
+        assert not np.any(np.isnan(w))
+        np.testing.assert_allclose(w.sum(), 1.0, atol=1e-12)
+
+    def test_masked_entries_zero(self) -> None:
+        logits = np.array([1.0, 2.0, 3.0])
+        ids = np.array([0, 0, 0], dtype=np.int64)
+        mask = np.array([True, False, True])
+        w = segment_softmax(logits, ids, 1, mask=mask)
+        assert w[1] == 0.0
+        np.testing.assert_allclose(w.sum(), 1.0, atol=1e-12)
+        # masked entry excluded from the denominator too
+        ref = np.exp(np.array([1.0, 3.0]) - 3.0)
+        ref = ref / ref.sum()
+        np.testing.assert_allclose(w[[0, 2]], ref, atol=1e-12)
+
+    def test_all_masked_segment_is_zero_no_nan(self) -> None:
+        logits = np.array([1.0, 2.0, 5.0])
+        ids = np.array([0, 0, 1], dtype=np.int64)
+        mask = np.array([True, True, False])
+        w = segment_softmax(logits, ids, 2, mask=mask)
+        assert not np.any(np.isnan(w))
+        assert w[2] == 0.0
+
+    def test_empty_segment_no_nan(self) -> None:
+        logits = np.array([1.0, 2.0])
+        ids = np.array([0, 0], dtype=np.int64)
+        w = segment_softmax(logits, ids, 3)
+        assert not np.any(np.isnan(w))
+
+    def test_torch_matches_numpy(self) -> None:
+        import torch
+
+        logits = np.array([0.3, 1.2, -0.7, 2.0])
+        ids = np.array([0, 0, 1, 1], dtype=np.int64)
+        mask = np.array([True, True, True, False])
+        ref = segment_softmax(logits, ids, 2, mask=mask)
+        out = segment_softmax(
+            torch.from_numpy(logits),
+            torch.from_numpy(ids),
+            2,
+            mask=torch.from_numpy(mask),
+        )
+        np.testing.assert_allclose(out.numpy(), ref, atol=1e-12)

From d3132d52ffa123ed5ad360ba955a6bf763fea3c3 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 3 Jul 2026 00:13:03 +0800
Subject: [PATCH 02/38] feat(dpmodel): center_edge_pairs primitive (shared by
 attention/angles)

Segment-based (global (E,E) boolean deliberately avoided): compact eager
form for carry-all graphs + shape-static nonzero-free form for the
center-major static layout (jit/export/make_fx traceable).
Part of NeighborGraph PR-D; PR-E angles reuse (unordered, no-self).
---
 .../dpmodel/utils/neighbor_graph/__init__.py  |   4 +
 deepmd/dpmodel/utils/neighbor_graph/pairs.py  | 172 ++++++++++++++++++
 .../common/dpmodel/test_center_edge_pairs.py  | 133 ++++++++++++++
 3 files changed, 309 insertions(+)
 create mode 100644 deepmd/dpmodel/utils/neighbor_graph/pairs.py
 create mode 100644 source/tests/common/dpmodel/test_center_edge_pairs.py

diff --git a/deepmd/dpmodel/utils/neighbor_graph/__init__.py b/deepmd/dpmodel/utils/neighbor_graph/__init__.py
index 19432698cd..24fb090309 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/__init__.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/__init__.py
@@ -32,6 +32,9 @@
     node_validity_mask,
     pad_and_guard_edges,
 )
+from .pairs import (
+    center_edge_pairs,
+)
 from .segment import (
     segment_max,
     segment_mean,
@@ -44,6 +47,7 @@
     "NeighborGraph",
     "build_neighbor_graph",
     "build_neighbor_graph_ase",
+    "center_edge_pairs",
     "edge_env_mat",
     "edge_force_virial",
     "frame_id_from_n_node",
diff --git a/deepmd/dpmodel/utils/neighbor_graph/pairs.py b/deepmd/dpmodel/utils/neighbor_graph/pairs.py
new file mode 100644
index 0000000000..3a8a5b6299
--- /dev/null
+++ b/deepmd/dpmodel/utils/neighbor_graph/pairs.py
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Pairs of edges sharing a center (``dst``) — the edge-pair axis.
+
+Shared primitive: graph-native attention (NeighborGraph PR-D) uses
+``(ordered=True, include_self=True)`` = the full transformer neighbor-pair
+square per center; 3-body angles (PR-E) use ``(ordered=False,
+include_self=False)``.
+
+Two forms:
+
+- **compact eager** (``static_nnei=None``): segment-based enumeration over the
+  real edges only — sort edge ids by center, expand each center's Cartesian
+  square via cumsum offsets. Dynamic ``P = sum(deg**2)``; memory ``O(P)``
+  (same order as dense attention's ``O(nloc * nnei**2)``). Uses data-dependent
+  shapes (``nonzero``) so it is EAGER-ONLY.
+- **shape-static** (``static_nnei`` set): assumes the center-major static
+  layout (``E = n_center * static_nnei``, edge ``c * static_nnei + m`` belongs
+  to center ``c`` — the layout ``from_dense_quartet(compact=False)`` emits).
+  Pure arange/reshape arithmetic, ``P = n_center * static_nnei**2`` with all
+  pairs materialized and validity carried by ``pair_mask`` — no data-dependent
+  ops, so it stays jit/export/make_fx-traceable.
+
+A global ``(E, E)`` same-center boolean is deliberately NOT used: with
+``E ~ N * nnei`` it costs ``O(N**2 * nnei**2)`` memory.
+"""
+
+from __future__ import annotations
+
+from typing import (
+    Any,
+)
+
+import array_api_compat
+
+from deepmd.dpmodel.array_api import (
+    Array,
+    xp_add_at,
+)
+
+
+def center_edge_pairs(
+    dst: Array,
+    edge_mask: Array,
+    n_total: int,
+    *,
+    include_self: bool = True,
+    ordered: bool = True,
+    static_nnei: int | None = None,
+) -> tuple[Array, Array, Array]:
+    """Enumerate pairs of edges sharing a center.
+
+    Parameters
+    ----------
+    dst : Array
+        (E,) int64 center of each edge (``edge_index[1]``).
+    edge_mask : Array
+        (E,) bool, real (True) vs padding (False) edges.
+    n_total : int
+        Number of centers (bounds ``dst``); read by the compact form only.
+    include_self : bool
+        Keep the ``m == n`` diagonal (transformer self-attention needs it).
+    ordered : bool
+        Keep both ``(m, n)`` and ``(n, m)`` (attention: yes, ``q_m . k_n`` is
+        not symmetric). ``False`` keeps only ``n >= m`` (with
+        ``include_self=False``: ``n > m`` — the angle set).
+    static_nnei : int | None
+        ``None`` -> compact eager form. Set -> shape-static form assuming the
+        center-major layout ``E = n_center * static_nnei`` (``dst`` values unread).
+
+    Returns
+    -------
+    query_edge : Array
+        (P,) int64 edge index of the query (``m``).
+    key_edge : Array
+        (P,) int64 edge index of the key (``n``).
+    pair_mask : Array
+        (P,) bool; False where either edge is padding or the pair is filtered
+        by the ``include_self`` / ``ordered`` policy (shape-static form; the
+        compact form drops such pairs and returns all-True).
+    """
+    xp = array_api_compat.array_namespace(dst)
+    dev = array_api_compat.device(dst)
+    if static_nnei is not None:
+        return _pairs_shape_static(
+            xp, dev, dst, edge_mask, static_nnei, include_self, ordered
+        )
+    return _pairs_compact(xp, dev, dst, edge_mask, n_total, include_self, ordered)
+
+
+def _pairs_shape_static(
+    xp: Any,
+    dev: Any,
+    dst: Array,
+    edge_mask: Array,
+    nn: int,
+    include_self: bool,
+    ordered: bool,
+) -> tuple[Array, Array, Array]:
+    e_tot = dst.shape[0]
+    # (E, nn): every edge queries the nn slots of its own center block
+    eids = xp.arange(e_tot, dtype=xp.int64, device=dev)
+    base = (eids // nn) * nn  # start of each edge's center block
+    slots = xp.arange(nn, dtype=xp.int64, device=dev)
+    q2 = xp.broadcast_to(eids[:, None], (e_tot, nn))
+    k2 = base[:, None] + slots[None, :]
+    query_edge = xp.reshape(q2, (-1,))
+    key_edge = xp.reshape(k2, (-1,))
+    pair_mask = xp.take(edge_mask, query_edge, axis=0) & xp.take(
+        edge_mask, key_edge, axis=0
+    )
+    if not include_self:
+        pair_mask = pair_mask & (query_edge != key_edge)
+    if not ordered:
+        pair_mask = pair_mask & (key_edge >= query_edge)
+    return query_edge, key_edge, pair_mask
+
+
+def _pairs_compact(
+    xp: Any,
+    dev: Any,
+    dst: Array,
+    edge_mask: Array,
+    n_total: int,
+    include_self: bool,
+    ordered: bool,
+) -> tuple[Array, Array, Array]:
+    empty = (
+        xp.zeros((0,), dtype=xp.int64, device=dev),
+        xp.zeros((0,), dtype=xp.int64, device=dev),
+        xp.zeros((0,), dtype=xp.bool, device=dev),
+    )
+    if dst.shape[0] == 0:
+        return empty
+    # real edges only, grouped by center (stable sort keeps original order
+    # within a center — irrelevant for correctness, deterministic for tests)
+    (real_idx,) = xp.nonzero(edge_mask)
+    r_tot = real_idx.shape[0]
+    if r_tot == 0:
+        return empty
+    d_real = xp.take(dst, real_idx, axis=0)
+    order = xp.argsort(d_real, stable=True)
+    eid = xp.take(real_idx, order, axis=0)  # (R,) edge ids, center-grouped
+    ds = xp.take(d_real, order, axis=0)  # (R,) sorted centers
+    # per-center degree and group start (over the sorted layout)
+    ones = xp.ones((r_tot,), dtype=xp.int64, device=dev)
+    counts = xp_add_at(
+        xp.zeros((n_total,), dtype=xp.int64, device=dev), ds, ones
+    )  # (n_total,)
+    csum = xp.cumulative_sum(counts)
+    start = csum - counts  # (n_total,) group start per center
+    deg = xp.take(counts, ds, axis=0)  # (R,) degree of each edge's center
+    # each sorted edge t emits deg[t] pairs; P = sum(deg**2)
+    query_sorted = xp.repeat(xp.arange(r_tot, dtype=xp.int64, device=dev), deg)  # (P,)
+    # within each query's block, a 0..deg-1 ramp indexes the key group
+    pair_off = xp.cumulative_sum(deg) - deg  # (R,) exclusive prefix of deg
+    p_tot = query_sorted.shape[0]
+    ramp = xp.arange(p_tot, dtype=xp.int64, device=dev) - xp.take(
+        pair_off, query_sorted, axis=0
+    )
+    key_sorted = xp.take(start, xp.take(ds, query_sorted, axis=0), axis=0) + ramp
+    query_edge = xp.take(eid, query_sorted, axis=0)
+    key_edge = xp.take(eid, key_sorted, axis=0)
+    keep = xp.ones((p_tot,), dtype=xp.bool, device=dev)
+    if not include_self:
+        keep = keep & (query_edge != key_edge)
+    if not ordered:
+        keep = keep & (key_edge >= query_edge)
+    (kept,) = xp.nonzero(keep)
+    query_edge = xp.take(query_edge, kept, axis=0)
+    key_edge = xp.take(key_edge, kept, axis=0)
+    pair_mask = xp.ones((query_edge.shape[0],), dtype=xp.bool, device=dev)
+    return query_edge, key_edge, pair_mask
diff --git a/source/tests/common/dpmodel/test_center_edge_pairs.py b/source/tests/common/dpmodel/test_center_edge_pairs.py
new file mode 100644
index 0000000000..aa018b6bb8
--- /dev/null
+++ b/source/tests/common/dpmodel/test_center_edge_pairs.py
@@ -0,0 +1,133 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""center_edge_pairs: pairs of edges sharing a center (NeighborGraph PR-D/E)."""
+
+import numpy as np
+
+from deepmd.dpmodel.utils.neighbor_graph import (
+    center_edge_pairs,
+)
+
+
+def _oracle(dst, mask, include_self, ordered):
+    pairs = []
+    for m in range(len(dst)):
+        if not mask[m]:
+            continue
+        for n in range(len(dst)):
+            if not mask[n] or dst[m] != dst[n]:
+                continue
+            if not include_self and m == n:
+                continue
+            if not ordered and n < m:
+                continue
+            pairs.append((m, n))
+    return set(pairs)
+
+
+def _got(q, k, pm):
+    return {(int(q[p]), int(k[p])) for p in range(q.shape[0]) if pm[p]}
+
+
+class TestCompact:
+    def test_transformer_all_ordered_with_self(self) -> None:
+        # 3 edges: dst = [0, 0, 1]; center 0 has edges {0,1}, center 1 has {2}
+        dst = np.array([0, 0, 1], dtype=np.int64)
+        edge_mask = np.array([True, True, True])
+        q, k, pm = center_edge_pairs(dst, edge_mask, 2)
+        assert _got(q, k, pm) == _oracle([0, 0, 1], [1, 1, 1], True, True)
+        # center 0: (0,0),(0,1),(1,0),(1,1); center 1: (2,2) => 5 pairs
+        assert len(_got(q, k, pm)) == 5
+
+    def test_unordered_no_self_is_angle_set(self) -> None:
+        dst = np.array([0, 0, 0], dtype=np.int64)
+        edge_mask = np.array([True, True, True])
+        q, k, pm = center_edge_pairs(
+            dst, edge_mask, 1, include_self=False, ordered=False
+        )
+        assert _got(q, k, pm) == {(0, 1), (0, 2), (1, 2)}
+
+    def test_ignores_padding_edges(self) -> None:
+        dst = np.array([0, 0, 0], dtype=np.int64)
+        edge_mask = np.array([True, True, False])  # 3rd is a guard edge
+        q, k, pm = center_edge_pairs(dst, edge_mask, 1)
+        assert _got(q, k, pm) == {(0, 0), (0, 1), (1, 0), (1, 1)}
+
+    def test_non_contiguous_center_order(self) -> None:
+        # edges NOT sorted by center: dst = [1, 0, 1, 0]
+        dst = np.array([1, 0, 1, 0], dtype=np.int64)
+        edge_mask = np.array([True, True, True, True])
+        q, k, pm = center_edge_pairs(dst, edge_mask, 2)
+        assert _got(q, k, pm) == _oracle([1, 0, 1, 0], [1] * 4, True, True)
+
+    def test_empty(self) -> None:
+        dst = np.zeros((0,), dtype=np.int64)
+        edge_mask = np.zeros((0,), dtype=bool)
+        q, k, pm = center_edge_pairs(dst, edge_mask, 3)
+        assert q.shape[0] == 0 and k.shape[0] == 0 and pm.shape[0] == 0
+
+    def test_random_vs_oracle(self) -> None:
+        rng = np.random.default_rng(7)
+        dst = rng.integers(0, 5, size=23).astype(np.int64)
+        edge_mask = rng.random(23) > 0.3
+        for include_self in (True, False):
+            for ordered in (True, False):
+                q, k, pm = center_edge_pairs(
+                    dst, edge_mask, 5, include_self=include_self, ordered=ordered
+                )
+                assert _got(q, k, pm) == _oracle(
+                    dst, edge_mask, include_self, ordered
+                ), (include_self, ordered)
+
+    def test_torch_matches_numpy(self) -> None:
+        import torch
+
+        dst = np.array([0, 0, 1, 1, 1], dtype=np.int64)
+        edge_mask = np.array([True, False, True, True, True])
+        ref = _got(*center_edge_pairs(dst, edge_mask, 2))
+        q, k, pm = center_edge_pairs(
+            torch.from_numpy(dst), torch.from_numpy(edge_mask), 2
+        )
+        assert _got(q.numpy(), k.numpy(), pm.numpy()) == ref
+
+
+class TestShapeStatic:
+    def test_matches_compact(self) -> None:
+        # center-major static layout: 2 centers x static_nnei=3, edges 2,5 padded
+        dst = np.array([0, 0, 0, 1, 1, 1], dtype=np.int64)
+        edge_mask = np.array([True, True, False, True, True, False])
+        qc, kc, pmc = center_edge_pairs(dst, edge_mask, 2)
+        qs, ks, pms = center_edge_pairs(dst, edge_mask, 2, static_nnei=3)
+        assert qs.shape[0] == 2 * 3 * 3  # static P, data-independent
+        assert _got(qs, ks, pms) == _got(qc, kc, pmc)
+
+    def test_flags_and_masking(self) -> None:
+        dst = np.array([0, 0, 0, 1, 1, 1], dtype=np.int64)
+        edge_mask = np.array([True, True, True, True, False, False])
+        for include_self in (True, False):
+            for ordered in (True, False):
+                qs, ks, pms = center_edge_pairs(
+                    dst,
+                    edge_mask,
+                    2,
+                    include_self=include_self,
+                    ordered=ordered,
+                    static_nnei=3,
+                )
+                assert qs.shape[0] == 2 * 3 * 3  # P static regardless of flags
+                assert _got(qs, ks, pms) == _oracle(
+                    dst, edge_mask, include_self, ordered
+                ), (include_self, ordered)
+
+    def test_torch_matches_numpy(self) -> None:
+        import torch
+
+        dst = np.array([0, 0, 1, 1], dtype=np.int64)
+        edge_mask = np.array([True, False, True, True])
+        ref = _got(*center_edge_pairs(dst, edge_mask, 2, static_nnei=2))
+        q, k, pm = center_edge_pairs(
+            torch.from_numpy(dst),
+            torch.from_numpy(edge_mask),
+            2,
+            static_nnei=2,
+        )
+        assert _got(q.numpy(), k.numpy(), pm.numpy()) == ref

From 05fbd8fd3b46fba57388668988c18ce32055bbd1 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 3 Jul 2026 00:30:12 +0800
Subject: [PATCH 03/38] feat(dpmodel): graph-native se_atten transformer
 attention (attn_layer > 0)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DescrptBlockSeAtten.call_graph grows _graph_attention: the dense per-center
(nnei, nnei) attention square becomes the edge-pair axis (center_edge_pairs,
ordered + self-included), softmax over keys becomes segment_softmax grouped
by the query edge. Op-for-op mirror of GatedAttentionLayer.call (head_dim
QKV slicing, normalize q/k/v, temperature/scaling, smooth shift trick,
post-softmax sw and dotr weighting, residual + LayerNorm per layer).

- shape-static adapter path (static_nnei threaded from the dense call
  adapter): bit-exact vs the dense body, rtol 1e-12, full flag matrix
  (attn_layer 1/2 x dotr x smooth x normalize x temperature, binding and
  non-binding sel).
- carry-all (compact) graphs: exact for non-smooth; for smooth the dense
  branch keeps sel-padding slots in the softmax denominator (dense output is
  sel-DEPENDENT, up to ~1e-4) — the carry-all form drops those phantom terms
  by design (user decision 2026-07-03), pinned by a clean-divergence test.
- edge_env_mat(return_sw=True) exposes the per-edge switch (zeroed on
  padding) for the smooth branch.
- uses_graph_lower: attention configs are now graph-eligible (concat tebd,
  no exclude_types still required).
---
 deepmd/dpmodel/descriptor/dpa1.py             | 186 ++++++++++++++--
 deepmd/dpmodel/utils/neighbor_graph/env.py    |  15 +-
 .../dpmodel/test_dpa1_call_graph_block.py     |   7 +-
 .../test_dpa1_graph_attention_parity.py       | 210 ++++++++++++++++++
 4 files changed, 392 insertions(+), 26 deletions(-)
 create mode 100644 source/tests/common/dpmodel/test_dpa1_graph_attention_parity.py

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index 13f8f3e351..b498d3d296 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -428,15 +428,14 @@ def get_numb_attn_layer(self) -> int:
     def uses_graph_lower(self) -> bool:
         """Returns whether this descriptor supports the graph-native lower.
 
-        The graph-native energy lower (``call_graph``) currently covers only the
-        non-attention (``attn_layer == 0``) factorizable path with concat
-        type-embedding and no type exclusion. Any other config (attention,
-        ``tebd_input_mode == "strip"``, ``exclude_types``) falls back to the
-        legacy dense path, so those models keep working unchanged.
+        The graph-native lower (``call_graph``) covers the factorizable path
+        AND transformer attention (``attn_layer > 0``, NeighborGraph PR-D)
+        with concat type-embedding and no type exclusion. Remaining ineligible
+        configs (``tebd_input_mode == "strip"``, ``exclude_types``) fall back
+        to the legacy dense path, so those models keep working unchanged.
         """
         return (
-            self.se_atten.attn_layer == 0
-            and self.se_atten.tebd_input_mode == "concat"
+            self.se_atten.tebd_input_mode == "concat"
             and not self.se_atten.exclude_types
         )
 
@@ -643,6 +642,9 @@ def _call_graph_adapter(
             graph,
             atype_local,
             type_embedding=self.type_embedding.call(),
+            # the adapter graph is shape-static center-major (compact=False):
+            # keep the attention pair enumeration nonzero-free (traceable)
+            static_nnei=nnei,
         )
         # call_graph returns flat (N, ...) node axis; reshape to (nf, nloc, ...)
         # for the dense 5-tuple ABI -- this reshape is LOCAL to the adapter shim.
@@ -727,8 +729,9 @@ def call_graph(
         graph: Any,
         atype: Array,
         type_embedding: Array | None = None,
+        static_nnei: int | None = None,
     ) -> tuple[Array, Array]:
-        """Descriptor-level graph-native forward (``attn_layer == 0``).
+        """Descriptor-level graph-native forward.
 
         Wraps the block kernel
         :meth:`DescrptBlockSeAtten.call_graph`, adds the descriptor-level
@@ -760,7 +763,7 @@ def call_graph(
         xp = array_api_compat.array_namespace(graph.edge_vec)
         dev = array_api_compat.device(graph.edge_vec)
         grrg, rot_mat = self.se_atten.call_graph(
-            graph, atype, type_embedding=type_embedding
+            graph, atype, type_embedding=type_embedding, static_nnei=static_nnei
         )
         # FLAT node axis (N, ...): no (nf, nloc) reshape -- ragged-native, spec.
         if self.concat_output_tebd:
@@ -1670,12 +1673,15 @@ def call_graph(
         graph: Any,
         atype: Array,
         type_embedding: Array | None = None,
+        static_nnei: int | None = None,
     ) -> tuple[Array, Array]:
-        """Graph-native forward (``attn_layer=0`` only).
+        """Graph-native forward.
 
         Bit-exact analogue of :meth:`call` on the SAME neighbor list, with the
         neighbor-axis reduction replaced by a ``segment_sum`` over edge centers
-        (``dst``). Geometry enters only through ``graph.edge_vec``.
+        (``dst``) and the dense ``(nnei, nnei)`` transformer attention replaced
+        by pairs of edges sharing a center (``center_edge_pairs`` +
+        ``segment_softmax``). Geometry enters only through ``graph.edge_vec``.
 
         Parameters
         ----------
@@ -1687,6 +1693,12 @@ def call_graph(
             (N,) flat node atom types (``N = sum(graph.n_node)``).
         type_embedding
             (ntypes_with_padding, tebd_dim) type-embedding table.
+        static_nnei
+            When the graph uses the shape-static center-major layout
+            (``from_dense_quartet(compact=False)``, ``E = n_center * nnei``),
+            pass ``nnei`` so the attention edge-pair enumeration stays
+            jit/export-traceable (no ``nonzero``). ``None`` (carry-all /
+            compact graphs) selects the dynamic eager form.
 
         Returns
         -------
@@ -1699,8 +1711,7 @@ def call_graph(
 
         Notes
         -----
-        Known limitations (NeighborGraph PR-A):
-        - ``attn_layer == 0`` only (attention lands in PR-D);
+        Known limitations:
         - ``tebd_input_mode == "concat"`` only (strip mode lands later);
         - ``exclude_types`` is not yet supported and raises (lands in a later PR).
         """
@@ -1709,11 +1720,6 @@ def call_graph(
             segment_sum,
         )
 
-        if self.attn_layer != 0:
-            raise NotImplementedError(
-                "graph path supports attn_layer=0 only (NeighborGraph PR-A); "
-                "attn_layer>0 lands in PR-D"
-            )
         if self.tebd_input_mode not in ["concat"]:
             raise NotImplementedError(
                 "graph path supports tebd_input_mode='concat' only (NeighborGraph PR-A)"
@@ -1738,7 +1744,7 @@ def call_graph(
         # per-edge env-mat 4-vector, normalized by the center (dst) atom type.
         # self.mean/self.stddev are slot-independent (ntypes, nnei, 4); slot 0 is
         # the canonical per-type vector.
-        rr = edge_env_mat(
+        rr, sw_e = edge_env_mat(
             graph.edge_vec,
             center_type,
             self.mean[:, 0, :],
@@ -1747,7 +1753,8 @@ def call_graph(
             self.rcut_smth,
             protection=self.env_protection,
             edge_mask=graph.edge_mask,
-        )  # (E, 4)
+            return_sw=True,
+        )  # (E, 4), (E, 1) sw zeroed on padding
         # radial channel
         ss = rr[:, 0:1]  # (E, 1)
         # neighbor / center type embeddings (concat mode); ghost type == owner type
@@ -1764,6 +1771,13 @@ def call_graph(
             ss = xp.concat([ss, atype_embd_nlist], axis=-1)
         # embedding net (same weights as the dense path); applies on the last axis
         gg = self.embeddings[0].call(ss)  # (E, ng)
+        # transformer attention over each center's edges — mirrors the dense
+        # self.dpa1_attention(gg, nlist_mask, input_r, sw), which also runs on
+        # the UNMASKED gg (padding rows are neutralized afterwards).
+        if self.attn_layer > 0:
+            gg = self._graph_attention(
+                gg, rr, dst, n_total, graph.edge_mask, sw_e, static_nnei
+            )
         # zero padding/guard edges BEFORE the segment sum
         gg = gg * xp.astype(graph.edge_mask[:, None], gg.dtype)
         # outer product (replaces the dense gg[:,:,:,None] * rr[:,:,None,:])
@@ -1784,6 +1798,138 @@ def call_graph(
         rot_mat = gr[:, :, 1:]
         return grrg, rot_mat
 
+    def _graph_attention(
+        self,
+        gg: Array,
+        rr: Array,
+        dst: Array,
+        n_total: int,
+        edge_mask: Array,
+        sw_e: Array,
+        static_nnei: int | None,
+    ) -> Array:
+        """Graph-native transformer attention over each center's edges.
+
+        Ragged reproduction of :class:`NeighborGatedAttention` /
+        :class:`GatedAttentionLayer`: edges sharing a center attend to each
+        other. The dense ``(nnei, nnei)`` square per center becomes the
+        edge-pair axis from ``center_edge_pairs(ordered=True,
+        include_self=True)``; softmax over the key axis becomes
+        ``segment_softmax`` grouped by the query edge.
+
+        Parameters
+        ----------
+        gg : (E, ng) per-edge embedding (UNMASKED, as in the dense path).
+        rr : (E, 4) per-edge env-mat vector (``rr[:, 1:4]`` carries direction).
+        dst : (E,) center of each edge.
+        n_total : number of centers.
+        edge_mask : (E,) real-vs-padding edge mask.
+        sw_e : (E, 1) smooth switch, zeroed on padding edges.
+        static_nnei : shape-static layout ``nnei`` or ``None`` (compact eager).
+        """
+        from deepmd.dpmodel.utils.neighbor_graph import (
+            center_edge_pairs,
+        )
+
+        xp = array_api_compat.array_namespace(gg)
+        # per-edge normalized direction (mirrors the dense input_r,
+        # rr[..., 1:4] / max(|rr[..., 1:4]|, 1e-12))
+        dir3 = rr[:, 1:4]
+        normed = safe_for_vector_norm(dir3, axis=-1, keepdims=True)
+        input_r = dir3 / xp.maximum(normed, xp.full_like(normed, 1e-12))  # (E, 3)
+        # transformer neighbor-pairs: full ordered square incl. the diagonal
+        # (q_m . k_n is not symmetric and self-attention keeps m == n)
+        q_e, k_e, pair_mask = center_edge_pairs(
+            dst,
+            edge_mask,
+            n_total,
+            include_self=True,
+            ordered=True,
+            static_nnei=static_nnei,
+        )
+        for layer in self.dpa1_attention.attention_layers:
+            gg = self._graph_attention_one_layer(
+                layer, gg, input_r, sw_e, q_e, k_e, pair_mask
+            )
+        return gg
+
+    def _graph_attention_one_layer(
+        self,
+        layer: "NeighborGatedAttentionLayer",
+        gg: Array,
+        input_r: Array,
+        sw_e: Array,
+        q_e: Array,
+        k_e: Array,
+        pair_mask: Array,
+    ) -> Array:
+        """One residual attention layer, op-for-op vs the dense reference.
+
+        Mirrors ``NeighborGatedAttentionLayer.call`` (residual +
+        ``GatedAttentionLayer.call`` + LayerNorm). Structural translation:
+        per-center ``q @ k^T`` -> per-pair ``q_m . k_n``; softmax over the key
+        axis -> ``segment_softmax`` grouped by the query edge. The smooth
+        branch keeps padding pairs IN the softmax denominator with ``sw = 0``
+        (weight ``exp(-attnw_shift)``), exactly like the dense branch, which
+        replaces the ``-inf`` masking by the switch weighting.
+        """
+        from deepmd.dpmodel.utils.neighbor_graph import (
+            segment_softmax,
+            segment_sum,
+        )
+
+        xp = array_api_compat.array_namespace(gg)
+        e_tot = gg.shape[0]
+        gal = layer.attention_layer  # GatedAttentionLayer
+        if gal.num_heads != 1:
+            raise NotImplementedError(
+                "graph attention assumes num_heads == 1 (dpa1 never exposes "
+                "num_heads; the dense head_dim QKV slicing relies on it)"
+            )
+        hd = gal.head_dim  # == hidden_dim for num_heads == 1
+        residual = gg
+        # in_proj -> Q, K, V; mirror the dense HEAD_DIM slicing exactly
+        qkv = gal.in_proj.call(gg)  # (E, 3 * hidden)
+        q = qkv[:, 0:hd]
+        k = qkv[:, hd : hd * 2]
+        v = qkv[:, hd * 2 : hd * 3]
+        if gal.normalize:
+            q = np_normalize(q, axis=-1)
+            k = np_normalize(k, axis=-1)
+            v = np_normalize(v, axis=-1)
+        q = q * gal.scaling
+        # per-pair logits q_m . k_n (num_heads == 1)
+        logits = xp.sum(
+            xp.take(q, q_e, axis=0) * xp.take(k, k_e, axis=0), axis=-1
+        )  # (P,)
+        if gal.smooth:
+            # (logits + shift) * sw_m * sw_n - shift, then softmax WITHOUT the
+            # pair mask: padding pairs stay in the denominator at exp(-shift),
+            # mirroring the dense smooth branch (sw already zeroed on padding).
+            attnw_shift = 20.0  # dense GatedAttentionLayer.call default
+            sw_flat = sw_e[:, 0]  # (E,)
+            sw_q = xp.take(sw_flat, q_e, axis=0)
+            sw_k = xp.take(sw_flat, k_e, axis=0)
+            logits = (logits + attnw_shift) * sw_q * sw_k - attnw_shift
+            w = segment_softmax(logits, q_e, e_tot)  # (P,)
+            w = w * sw_q * sw_k
+        else:
+            # non-smooth: dense masks padding keys to -inf pre-softmax ==
+            # excluding them from the softmax entirely
+            w = segment_softmax(logits, q_e, e_tot, mask=pair_mask)
+        if gal.dotr:
+            angular = xp.sum(
+                xp.take(input_r, q_e, axis=0) * xp.take(input_r, k_e, axis=0),
+                axis=-1,
+            )  # (P,) = input_r_m . input_r_n
+            w = w * angular
+        # o_m = sum_n w[m, n] v[n] -> segment_sum over the query edge
+        wv = w[:, None] * xp.take(v, k_e, axis=0)  # (P, hd)
+        o = segment_sum(wv, q_e, e_tot)  # (E, hd)
+        out = gal.out_proj.call(o)  # (E, ng)
+        x = residual + out
+        return layer.attn_layer_norm.call(x)
+
     def has_message_passing(self) -> bool:
         """Returns whether the descriptor block has message passing."""
         return False
diff --git a/deepmd/dpmodel/utils/neighbor_graph/env.py b/deepmd/dpmodel/utils/neighbor_graph/env.py
index 55bbe1b02f..4057cd8640 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/env.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/env.py
@@ -41,7 +41,8 @@ def edge_env_mat(
     rcut_smth: float,
     protection: float = 0.0,
     edge_mask: Array | None = None,
-) -> Array:
+    return_sw: bool = False,
+) -> Array | tuple[Array, Array]:
     """Compute the per-edge environment-matrix 4-vector.
 
     Mirrors the math in ``_make_env_mat`` / ``EnvMat.call`` (env_mat.py)
@@ -79,6 +80,9 @@ def edge_env_mat(
         (E, 4) normalized environment-matrix vectors.
         Padding edges (``edge_vec = 0``) produce nonzero values but are
         masked by ``NeighborGraph.edge_mask`` downstream.
+        When ``return_sw`` is True, returns ``(em, sw)`` where ``sw`` is the
+        (E, 1) smooth switch, zeroed on padding edges (mirrors the dense
+        ``_make_env_mat`` mask; consumed by the smooth attention branch).
     """
     xp = array_api_compat.array_namespace(edge_vec)
     dev = array_api_compat.device(edge_vec)
@@ -114,4 +118,13 @@ def edge_env_mat(
     avg = xp.take(xp.asarray(davg, device=dev), center_type, axis=0)  # (E, 4)
     std = xp.take(xp.asarray(dstd, device=dev), center_type, axis=0)  # (E, 4)
 
+    if return_sw:
+        # per-edge switch, zeroed on padding edges — mirrors the dense
+        # ``_make_env_mat`` (``weight = weight * mask``); used by the smooth
+        # attention branch.
+        if edge_mask is not None:
+            sw_out = sw * xp.astype(edge_mask[:, None], sw.dtype)
+        else:
+            sw_out = sw
+        return (em - avg) / std, sw_out
     return (em - avg) / std
diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_block.py b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
index e8930101dd..9a984a30f3 100644
--- a/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
@@ -90,11 +90,8 @@ def test_block_graph_equals_dense_any_sel(self, sel, type_one_side) -> None:
             atol=1e-12,
         )
 
-    def test_attn_layer_gt0_raises(self) -> None:
-        """The graph block kernel fail-fasts for attn_layer > 0 (unsupported)."""
-        dd = DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=[20], ntypes=2, attn_layer=2)
-        with pytest.raises(NotImplementedError):
-            dd.se_atten.call_graph(None, np.array([0], dtype=np.int64))
+    # attn_layer > 0 is supported since NeighborGraph PR-D; parity is covered
+    # by test_dpa1_graph_attention_parity.py (the fail-fast test was removed).
 
     def test_exclude_types_raises(self) -> None:
         """The graph block kernel fail-fasts for exclude_types (not yet applied)."""
diff --git a/source/tests/common/dpmodel/test_dpa1_graph_attention_parity.py b/source/tests/common/dpmodel/test_dpa1_graph_attention_parity.py
new file mode 100644
index 0000000000..d6d8ac76dc
--- /dev/null
+++ b/source/tests/common/dpmodel/test_dpa1_graph_attention_parity.py
@@ -0,0 +1,210 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Graph-native se_atten attention (attn_layer > 0) vs the dense reference.
+
+Regime-1 parity (NeighborGraph PR-D): the graph is built FROM the same dense
+nlist (``from_dense_quartet``), so the neighbor sets are identical and the
+graph attention must reproduce ``GatedAttentionLayer``/``NeighborGatedAttention``
+to within 1e-12 (CPU rtol/atol) for ANY sel — binding or not.
+
+The smooth branch needs the SHAPE-STATIC graph (``compact=False`` +
+``static_nnei``): dense smooth keeps padding slots in the softmax DENOMINATOR
+(weight ``exp(-attnw_shift)`` since ``sw = 0``), so bit-parity requires the
+same padded pairs on the graph side. The compact (carry-all-like) form drops
+padding pairs and is exercised on the non-smooth branch only.
+"""
+
+import numpy as np
+import pytest
+
+from deepmd.dpmodel.descriptor.dpa1 import (
+    DescrptDPA1,
+)
+from deepmd.dpmodel.utils.neighbor_graph import (
+    from_dense_quartet,
+)
+from deepmd.dpmodel.utils.nlist import (
+    extend_input_and_build_neighbor_list,
+)
+
+GLOBAL_SEED = 20260703
+
+
+def _make(
+    attn_layer,
+    dotr=False,
+    smooth=False,
+    normalize=False,
+    temperature=1.0,
+    sel=(20,),
+):
+    # attention `smooth` is wired to smooth_type_embedding (NOT rcut_smth);
+    # pass it explicitly — its default (True) would silently enable the
+    # smooth branch in the non-smooth cases.
+    return DescrptDPA1(
+        rcut=4.0,
+        rcut_smth=0.5,
+        sel=list(sel),
+        ntypes=2,
+        neuron=[6, 12],
+        axis_neuron=2,
+        attn=8,
+        attn_layer=attn_layer,
+        attn_dotr=dotr,
+        attn_mask=False,
+        normalize=normalize,
+        smooth_type_embedding=smooth,
+        temperature=temperature,
+        tebd_input_mode="concat",
+        type_one_side=True,
+        precision="float64",
+        seed=GLOBAL_SEED,
+    )
+
+
+class TestGraphAttentionParity:
+    def setup_method(self) -> None:
+        rng = np.random.default_rng(GLOBAL_SEED)
+        self.nloc = 5
+        self.coord = rng.normal(size=(1, self.nloc, 3)) * 1.5
+        self.atype = np.array([[0, 1, 0, 1, 1]], dtype=np.int64)
+
+    def _quartet(self, dd):
+        return extend_input_and_build_neighbor_list(
+            self.coord,
+            self.atype,
+            dd.get_rcut(),
+            dd.get_sel(),
+            mixed_types=dd.mixed_types(),
+            box=None,
+        )
+
+    def _dense_vs_adapter(self, dd, rtol=1e-12):
+        """Descriptor-level: legacy dense body vs the graph adapter (shape-static)."""
+        ext_coord, ext_atype, mapping, nlist = self._quartet(dd)
+        dense = dd._call_dense(ext_coord, ext_atype, nlist)
+        graph = dd._call_graph_adapter(ext_coord, ext_atype, nlist, mapping)
+        np.testing.assert_allclose(
+            graph[0], dense[0], rtol=rtol, atol=rtol, err_msg="descriptor"
+        )
+        np.testing.assert_allclose(
+            graph[1], dense[1], rtol=rtol, atol=rtol, err_msg="rot_mat"
+        )
+
+    # ── Task 5a/5b/5c/5e: full matrix on the shape-static adapter path ──────
+    @pytest.mark.parametrize("sel", [(20,), (3,)])  # non-binding AND binding
+    @pytest.mark.parametrize("attn_layer", [1, 2])  # single + stacked layers
+    def test_core_layers_sel(self, attn_layer, sel) -> None:
+        dd = _make(attn_layer, sel=sel)
+        self._dense_vs_adapter(dd)
+
+    @pytest.mark.parametrize("normalize", [False, True])  # q/k/v np_normalize
+    @pytest.mark.parametrize("temperature", [None, 1.0])  # scaling source
+    def test_normalize_temperature(self, normalize, temperature) -> None:
+        dd = _make(1, normalize=normalize, temperature=temperature)
+        self._dense_vs_adapter(dd)
+
+    @pytest.mark.parametrize("dotr", [False, True])  # angular weighting
+    @pytest.mark.parametrize("smooth", [False, True])  # switch-fn weighting
+    def test_dotr_smooth(self, dotr, smooth) -> None:
+        dd = _make(2, dotr=dotr, smooth=smooth, normalize=True, temperature=None)
+        self._dense_vs_adapter(dd)
+
+    # ── compact (carry-all-form) graph through the BLOCK kernel, non-smooth ──
+    @pytest.mark.parametrize("attn_layer", [1, 2])  # single + stacked layers
+    def test_block_compact_graph_no_smooth(self, attn_layer) -> None:
+        dd = _make(attn_layer, dotr=True, normalize=True)
+        blk = dd.se_atten
+        ext_coord, ext_atype, mapping, nlist = self._quartet(dd)
+        tebd = dd.type_embedding.call()
+        nf, nall = ext_atype.shape
+        atype_embd_ext = np.reshape(
+            np.take(tebd, np.reshape(ext_atype, (-1,)), axis=0),
+            (nf, nall, dd.tebd_dim),
+        )
+        dense_g, *_ = blk.call(
+            nlist,
+            ext_coord,
+            ext_atype,
+            atype_embd_ext=atype_embd_ext,
+            mapping=None,
+            type_embedding=tebd,
+        )
+        ng = from_dense_quartet(ext_coord, nlist, mapping)  # compact=True
+        graph_g, _ = blk.call_graph(
+            ng, np.reshape(ext_atype, (-1,)), type_embedding=tebd
+        )
+        np.testing.assert_allclose(
+            graph_g.reshape(dense_g.shape), dense_g, rtol=1e-12, atol=1e-12
+        )
+
+    # ── smooth on the compact (carry-all) form: CLEAN DIVERGENCE by design ────
+    def test_block_compact_graph_smooth_clean_divergence(self) -> None:
+        """Carry-all smooth attention deliberately DIVERGES from dense.
+
+        The dense smooth branch keeps sel-padding slots in the attention
+        softmax DENOMINATOR at weight ``exp(-attnw_shift)``, which makes the
+        dense output depend on ``sel`` itself (same physical neighbors,
+        different sel => different output, up to ~1e-4). The carry-all graph
+        drops those phantom terms — the sel-independent math (user decision
+        2026-07-03, PR-D). Bit-parity (1e-12) is proven on the shape-static
+        adapter (same padded pairs on both sides, ``test_dotr_smooth``); here
+        we pin only that the compact form stays CLOSE to dense (the artifact
+        is a bounded denominator perturbation) while NOT bit-equal.
+        """
+        dd = _make(1, smooth=True)
+        blk = dd.se_atten
+        ext_coord, ext_atype, mapping, nlist = self._quartet(dd)
+        tebd = dd.type_embedding.call()
+        nf, nall = ext_atype.shape
+        atype_embd_ext = np.reshape(
+            np.take(tebd, np.reshape(ext_atype, (-1,)), axis=0),
+            (nf, nall, dd.tebd_dim),
+        )
+        dense_g, *_ = blk.call(
+            nlist,
+            ext_coord,
+            ext_atype,
+            atype_embd_ext=atype_embd_ext,
+            mapping=None,
+            type_embedding=tebd,
+        )
+        ng = from_dense_quartet(ext_coord, nlist, mapping)
+        graph_g, _ = blk.call_graph(
+            ng, np.reshape(ext_atype, (-1,)), type_embedding=tebd
+        )
+        graph_g = graph_g.reshape(dense_g.shape)
+        # close (the artifact is a small denominator perturbation) ...
+        np.testing.assert_allclose(graph_g, dense_g, rtol=1e-3, atol=1e-3)
+        # ... but NOT bit-equal: the phantom-padding terms are really gone
+        assert np.max(np.abs(graph_g - dense_g)) > 1e-9
+
+    # ── torch namespace smoke (CLAUDE.md: catches numpy-weight leaks) ────────
+    # NB: the smoke runs the BLOCK kernel with a torch type_embedding table;
+    # the raw dpmodel adapter is numpy-weighted by design (pt_expt wraps it).
+    def test_torch_block_matches_numpy(self) -> None:
+        import torch
+
+        dd = _make(2, dotr=True, smooth=True, normalize=True, temperature=None)
+        blk = dd.se_atten
+        ext_coord, ext_atype, mapping, nlist = self._quartet(dd)
+        tebd = dd.type_embedding.call()
+        ng = from_dense_quartet(ext_coord, nlist, mapping, compact=False)
+        ref, _ = blk.call_graph(
+            ng,
+            np.reshape(ext_atype, (-1,)),
+            type_embedding=tebd,
+            static_nnei=nlist.shape[2],
+        )
+        ng_t = from_dense_quartet(
+            torch.from_numpy(ext_coord),
+            torch.from_numpy(nlist),
+            torch.from_numpy(mapping),
+            compact=False,
+        )
+        out, _ = blk.call_graph(
+            ng_t,
+            torch.from_numpy(np.reshape(ext_atype, (-1,))),
+            type_embedding=torch.from_numpy(tebd),
+            static_nnei=nlist.shape[2],
+        )
+        np.testing.assert_allclose(out.numpy(), ref, rtol=1e-12, atol=1e-12)

From 61285c6d108ca4eeb38fdb44ecfe0913080132ac Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 3 Jul 2026 00:36:08 +0800
Subject: [PATCH 04/38] test(pt_expt): graph attention make_fx (merge gate) +
 model force/virial parity
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- test_make_fx_graph_attn: graph forward + autograd.grad at attn_layer=2
  traces under make_fx for BOTH smooth branches (the shape-static
  center_edge_pairs form is nonzero-free) — required since pt_expt compiled
  training routes eligible models through the graph lower.
- model-level graph-vs-legacy lower parity now parametrized over
  attn_layer {0, 2} (energy/force/virial/atom_virial, 1e-12 CPU).
- eligibility pins: attention+concat is graph-eligible; se_atten_v2
  (tebd_input_mode='strip') correctly stays dense (strip-mode graph
  support is deferred to a later PR; the plan's assumption that
  se_atten_v2 would inherit graph support for free did not hold).
---
 .../test_dpa1_graph_attention_parity.py       | 17 ++++++
 source/tests/pt_expt/descriptor/test_dpa1.py  | 59 +++++++++++++++++++
 .../pt_expt/model/test_dpa1_graph_lower.py    | 15 +++--
 3 files changed, 87 insertions(+), 4 deletions(-)

diff --git a/source/tests/common/dpmodel/test_dpa1_graph_attention_parity.py b/source/tests/common/dpmodel/test_dpa1_graph_attention_parity.py
index d6d8ac76dc..16930f0b41 100644
--- a/source/tests/common/dpmodel/test_dpa1_graph_attention_parity.py
+++ b/source/tests/common/dpmodel/test_dpa1_graph_attention_parity.py
@@ -208,3 +208,20 @@ def test_torch_block_matches_numpy(self) -> None:
             static_nnei=nlist.shape[2],
         )
         np.testing.assert_allclose(out.numpy(), ref, rtol=1e-12, atol=1e-12)
+
+
+class TestGraphEligibility:
+    def test_attention_concat_is_graph_eligible(self) -> None:
+        assert _make(2).uses_graph_lower()
+
+    def test_strip_mode_stays_dense(self) -> None:
+        """se_atten_v2 (tebd_input_mode='strip') is NOT graph-eligible yet:
+        strip-mode graph support is a later PR; it must keep the dense route
+        (the PR-D plan's 'se_atten_v2 inherits for free' did not hold).
+        """
+        from deepmd.dpmodel.descriptor.se_atten_v2 import (
+            DescrptSeAttenV2,
+        )
+
+        dd = DescrptSeAttenV2(rcut=4.0, rcut_smth=0.5, sel=[20], ntypes=2, attn_layer=2)
+        assert not dd.uses_graph_lower()
diff --git a/source/tests/pt_expt/descriptor/test_dpa1.py b/source/tests/pt_expt/descriptor/test_dpa1.py
index d7d2718e67..cddd22419f 100644
--- a/source/tests/pt_expt/descriptor/test_dpa1.py
+++ b/source/tests/pt_expt/descriptor/test_dpa1.py
@@ -311,6 +311,65 @@ def fn(coord_ext, atype_ext, nlist, mapping):
             atol=atol,
         )
 
+    @pytest.mark.parametrize("smooth", [False, True])  # smooth attention branch
+    @pytest.mark.parametrize("prec", ["float64"])  # precision
+    def test_make_fx_graph_attn(self, prec, smooth) -> None:
+        """make_fx (export-readiness) of the GRAPH forward with attention.
+
+        MERGE BLOCKER (NeighborGraph PR-D): pt_expt compiled training routes
+        eligible models through the graph lower by default, so graph attention
+        (``attn_layer > 0``) must be fx-traceable — the shape-static
+        ``center_edge_pairs`` form keeps the pair enumeration ``nonzero``-free.
+        Covers both the smooth and non-smooth attention branches.
+        """
+        rng = np.random.default_rng(GLOBAL_SEED)
+        _, _, nnei = self.nlist.shape
+        davg = rng.normal(size=(self.nt, nnei, 4))
+        dstd = 0.1 + np.abs(rng.normal(size=(self.nt, nnei, 4)))
+
+        dtype = PRECISION_DICT[prec]
+        rtol, atol = get_tols(prec)
+        dd0 = DescrptDPA1(
+            self.rcut,
+            self.rcut_smth,
+            self.sel_mix,
+            self.nt,
+            attn_layer=2,
+            attn_dotr=True,
+            smooth_type_embedding=smooth,
+            precision=prec,
+            seed=GLOBAL_SEED,
+        ).to(self.device)
+        dd0.se_atten.mean = torch.tensor(davg, dtype=dtype, device=self.device)
+        dd0.se_atten.stddev = torch.tensor(dstd, dtype=dtype, device=self.device)
+        dd0 = dd0.eval()
+        coord_ext = torch.tensor(self.coord_ext, dtype=dtype, device=self.device)
+        atype_ext = torch.tensor(self.atype_ext, dtype=int, device=self.device)
+        nlist = torch.tensor(self.nlist, dtype=int, device=self.device)
+        mapping = torch.tensor(self.mapping, dtype=int, device=self.device)
+
+        def fn(coord_ext, atype_ext, nlist, mapping):
+            coord_ext = coord_ext.detach().requires_grad_(True)
+            rd = dd0(coord_ext, atype_ext, nlist, mapping)[0]
+            grad = torch.autograd.grad(rd.sum(), coord_ext, create_graph=False)[0]
+            return rd, grad
+
+        rd_eager, grad_eager = fn(coord_ext, atype_ext, nlist, mapping)
+        traced = make_fx(fn)(coord_ext, atype_ext, nlist, mapping)
+        rd_traced, grad_traced = traced(coord_ext, atype_ext, nlist, mapping)
+        np.testing.assert_allclose(
+            rd_eager.detach().cpu().numpy(),
+            rd_traced.detach().cpu().numpy(),
+            rtol=rtol,
+            atol=atol,
+        )
+        np.testing.assert_allclose(
+            grad_eager.detach().cpu().numpy(),
+            grad_traced.detach().cpu().numpy(),
+            rtol=rtol,
+            atol=atol,
+        )
+
     @pytest.mark.parametrize("shared_level", [0, 1])  # sharing level
     def test_share_params(self, shared_level) -> None:
         """share_params level 0: share all; level 1: share type_embedding only."""
diff --git a/source/tests/pt_expt/model/test_dpa1_graph_lower.py b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
index e274a1bcec..d6cbb51f2f 100644
--- a/source/tests/pt_expt/model/test_dpa1_graph_lower.py
+++ b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
@@ -91,7 +91,7 @@ def setup_method(self) -> None:
             [[0, 0, 0, 1, 1]], dtype=torch.int64, device=self.device
         )
 
-    def _make_model(self) -> EnergyModel:
+    def _make_model(self, attn_layer: int = 0, smooth: bool = False) -> EnergyModel:
         ds = DescrptDPA1(
             self.rcut,
             self.rcut_smth,
@@ -100,9 +100,13 @@ def _make_model(self) -> EnergyModel:
             neuron=[3, 6],
             axis_neuron=2,
             attn=4,
-            attn_layer=0,  # graph lower only supports attn_layer == 0
+            attn_layer=attn_layer,
             attn_dotr=True,
             attn_mask=False,
+            # smooth attention keeps sel-padding in the dense softmax
+            # denominator; the carry-all graph drops it BY DESIGN (PR-D), so
+            # exact graph-vs-dense parity requires smooth=False here.
+            smooth_type_embedding=smooth,
             activation_function="tanh",
             set_davg_zero=False,
             type_one_side=True,
@@ -165,13 +169,16 @@ def _prepare_lower_inputs(self, periodic: bool):
         mapping_t = torch.tensor(mapping, dtype=torch.int64, device=self.device)
         return ext_coord, ext_atype, nlist_t, mapping_t
 
+    @pytest.mark.parametrize("attn_layer", [0, 2])  # factorizable AND attention
     @pytest.mark.parametrize("periodic", [True, False])  # PBC vs non-PBC
     @pytest.mark.parametrize("do_av", [False, True])  # atom-virial off / on
-    def test_force_virial_parity_vs_legacy(self, periodic, do_av) -> None:
+    def test_force_virial_parity_vs_legacy(self, periodic, do_av, attn_layer) -> None:
         """Graph lower energy/force/virial/atom_virial == legacy dense lower on
         the SAME neighbor set (regime-1 graph from from_dense_quartet).
+        attn_layer=2 exercises graph attention through model-level autograd
+        (smooth=False: exact carry-all parity regime, NeighborGraph PR-D).
         """
-        model = self._make_model()
+        model = self._make_model(attn_layer=attn_layer)
         model.eval()
         tol = (
             {"rtol": 1e-12, "atol": 1e-12}

From 298518e06403f994b88101b961c5879b896202b6 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 3 Jul 2026 00:46:37 +0800
Subject: [PATCH 05/38] test: binding-sel audit for graph-default attention
 models
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- linear-model weight tests: pin smooth_type_embedding=False — the standard
  (graph-routed, carry-all) and linear (graph-ineligible, dense) submodels
  otherwise differ by the accepted smooth-attention denominator divergence
  (~1e-6), which is a route artifact, not a weight-combination bug.
- new binding-sel sanity: carry-all graph attention diverges from the
  sel-truncated dense path when sel binds (spec decision #17).
---
 .../test_dpa1_graph_attention_parity.py       | 42 +++++++++++++++++++
 .../tests/pt_expt/model/test_linear_model.py  |  6 +++
 2 files changed, 48 insertions(+)

diff --git a/source/tests/common/dpmodel/test_dpa1_graph_attention_parity.py b/source/tests/common/dpmodel/test_dpa1_graph_attention_parity.py
index 16930f0b41..97d044862f 100644
--- a/source/tests/common/dpmodel/test_dpa1_graph_attention_parity.py
+++ b/source/tests/common/dpmodel/test_dpa1_graph_attention_parity.py
@@ -225,3 +225,45 @@ def test_strip_mode_stays_dense(self) -> None:
 
         dd = DescrptSeAttenV2(rcut=4.0, rcut_smth=0.5, sel=[20], ntypes=2, attn_layer=2)
         assert not dd.uses_graph_lower()
+
+
+class TestBindingSelDivergence:
+    """At BINDING sel the carry-all graph attends over MORE neighbors than the
+    sel-truncated dense path — outputs must differ (sanity, not parity;
+    spec decision #17).
+    """
+
+    def test_carry_all_attention_differs_at_binding_sel(self) -> None:
+        from deepmd.dpmodel.utils.neighbor_graph import (
+            build_neighbor_graph,
+        )
+
+        rng = np.random.default_rng(GLOBAL_SEED)
+        nloc = 6
+        coord = rng.random((1, nloc, 3)) * 2.0  # dense blob => binding sel=2
+        atype = np.array([[0, 1, 0, 1, 1, 0]], dtype=np.int64)
+        dd = _make(2, dotr=True, sel=(2,))
+        ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
+            coord, atype, dd.get_rcut(), dd.get_sel(), mixed_types=True, box=None
+        )
+        assert (nlist >= 0).all(), "fixture must be sel-binding (all slots full)"
+        tebd = dd.type_embedding.call()
+        atype_embd_ext = np.reshape(
+            np.take(tebd, np.reshape(ext_atype, (-1,)), axis=0),
+            (1, ext_atype.shape[1], dd.tebd_dim),
+        )
+        dense_g, *_ = dd.se_atten.call(
+            nlist,
+            ext_coord,
+            ext_atype,
+            atype_embd_ext=atype_embd_ext,
+            mapping=None,
+            type_embedding=tebd,
+        )
+        graph = build_neighbor_graph(coord, atype, None, dd.get_rcut())
+        graph_g, _ = dd.se_atten.call_graph(
+            graph, atype.reshape(-1), type_embedding=tebd
+        )
+        assert np.max(np.abs(graph_g.reshape(dense_g.shape) - dense_g)) > 1e-6, (
+            "carry-all attention must diverge from sel-truncated dense"
+        )
diff --git a/source/tests/pt_expt/model/test_linear_model.py b/source/tests/pt_expt/model/test_linear_model.py
index a18cabd9e1..8c95f3e69e 100644
--- a/source/tests/pt_expt/model/test_linear_model.py
+++ b/source/tests/pt_expt/model/test_linear_model.py
@@ -343,6 +343,12 @@ def test_forward_lower_exportable(self) -> None:
         "temperature": 1.0,
         "set_davg_zero": True,
         "type_one_side": True,
+        # smooth attention diverges between the graph default (standard model,
+        # carry-all: no phantom sel-padding softmax terms) and the dense route
+        # (linear models are graph-ineligible) by design (NeighborGraph PR-D);
+        # pin smooth off so both routes are exact and the weight-combination
+        # comparison stays at 1e-10.
+        "smooth_type_embedding": False,
         "seed": 1,
     },
     "fitting_net": {

From ccefbbc353ee330357b6a4434388797cf8cb93fc Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 3 Jul 2026 01:19:46 +0800
Subject: [PATCH 06/38] test(pt_expt): pin smooth off in neighbor-list dpa1
 fixture (route parity)

neighbor_list=None now takes the carry-all graph default for eligible
attention models; explicit World-1 builders take the legacy dense route.
With smooth attention the two routes differ by design (PR-D), so the
route-equivalence tests pin smooth_type_embedding=False.
---
 source/tests/pt_expt/utils/test_neighbor_list.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/source/tests/pt_expt/utils/test_neighbor_list.py b/source/tests/pt_expt/utils/test_neighbor_list.py
index 26bf5c19da..ff29ed50c4 100644
--- a/source/tests/pt_expt/utils/test_neighbor_list.py
+++ b/source/tests/pt_expt/utils/test_neighbor_list.py
@@ -113,6 +113,11 @@
         "temperature": 1.0,
         "set_davg_zero": True,
         "type_one_side": True,
+        # smooth attention diverges between the carry-all graph default
+        # (neighbor_list=None) and the explicit World-1 builders by design
+        # (NeighborGraph PR-D: dense keeps sel-padding in the attention
+        # softmax denominator); pin smooth off so all routes are exact.
+        "smooth_type_embedding": False,
         "seed": 1,
     },
     "fitting_net": {"neuron": [8, 8], "resnet_dt": True, "seed": 1},

From 1c28aa5b6959b7e8c410bb1dba6c190d0ee8757e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 2 Jul 2026 17:35:25 +0000
Subject: [PATCH 07/38] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deepmd/dpmodel/utils/neighbor_graph/pairs.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/deepmd/dpmodel/utils/neighbor_graph/pairs.py b/deepmd/dpmodel/utils/neighbor_graph/pairs.py
index 3a8a5b6299..f8614d821b 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/pairs.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/pairs.py
@@ -24,7 +24,9 @@
 ``E ~ N * nnei`` it costs ``O(N**2 * nnei**2)`` memory.
 """
 
-from __future__ import annotations
+from __future__ import (
+    annotations,
+)
 
 from typing import (
     Any,

From 9ab2f3810424bd53b35ecd57903009ba19636550 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 3 Jul 2026 14:55:30 +0800
Subject: [PATCH 08/38] feat(dpmodel): make compact center_edge_pairs traceable
 via unbacked SymInts

The compact (carry-all) pair enumeration used nonzero + tensor-repeat with
Python control flow on their data-dependent sizes, so the attention graph
lower failed torch.export with GuardOnDataDependentSymNode. To fix this:
- register those sizes as unbacked SymInt sizes via the new
  xp_hint_dynamic_size shim (torch is imported lazily and only for torch
  arrays; no-op for numpy/jax);
- take the empty-input fast paths only when the shape is a concrete int;
- build iotas via cumsum(ones)-1, because the array_api_compat arange
  wrapper branches on the length in Python;
- skip the policy-compression nonzero when no filter applies
  (include_self and ordered, the attention default).
Eager numpy/torch results are unchanged.
---
 deepmd/dpmodel/array_api.py                  | 19 +++++++++
 deepmd/dpmodel/utils/neighbor_graph/pairs.py | 44 +++++++++++++++-----
 2 files changed, 53 insertions(+), 10 deletions(-)

diff --git a/deepmd/dpmodel/array_api.py b/deepmd/dpmodel/array_api.py
index 094d4dfd6f..115242edfb 100644
--- a/deepmd/dpmodel/array_api.py
+++ b/deepmd/dpmodel/array_api.py
@@ -211,6 +211,25 @@ def xp_add_at(x: Array, indices: Array, values: Array) -> Array:
         return x
 
 
+def xp_hint_dynamic_size(x: Array) -> None:
+    """Mark a data-dependent leading dimension as a valid size for torch.export.
+
+    Under symbolic tracing (``make_fx`` / ``torch.export``) the length of a
+    data-dependent array (e.g. the output of ``nonzero`` or a tensor-``repeat``)
+    is an UNBACKED SymInt; guarding Python control flow or allocations on it
+    raises ``GuardOnDataDependentSymNode``. ``torch._check_is_size`` registers
+    the ``>= 0`` size hint that lets the tracer treat it as a proper dimension
+    (recorded as a ``sym_constrain_range_for_size`` node, preserved by AOTI).
+
+    No-op for numpy / jax / eager-torch concrete shapes — safe to call
+    unconditionally from dpmodel code (torch imported lazily, torch arrays only).
+    """
+    if array_api_compat.is_torch_array(x):
+        import torch
+
+        torch._check_is_size(x.shape[0])
+
+
 def xp_maximum_at(x: Array, indices: Array, values: Array) -> Array:
     """Segment max-assign of values into x at the specified indices.
 
diff --git a/deepmd/dpmodel/utils/neighbor_graph/pairs.py b/deepmd/dpmodel/utils/neighbor_graph/pairs.py
index f8614d821b..75f2682f20 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/pairs.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/pairs.py
@@ -8,17 +8,23 @@
 
 Two forms:
 
-- **compact eager** (``static_nnei=None``): segment-based enumeration over the
+- **compact** (``static_nnei=None``): segment-based enumeration over the
   real edges only — sort edge ids by center, expand each center's Cartesian
   square via cumsum offsets. Dynamic ``P = sum(deg**2)``; memory ``O(P)``
-  (same order as dense attention's ``O(nloc * nnei**2)``). Uses data-dependent
-  shapes (``nonzero``) so it is EAGER-ONLY.
+  (same order as dense attention's ``O(nloc * nnei**2)``). The data-dependent
+  sizes (``nonzero`` output, tensor-``repeat`` output) are registered as
+  UNBACKED SymInt sizes via :func:`xp_hint_dynamic_size`, so the form traces
+  through ``make_fx``/``torch.export`` and compiles under AOTI (torch >= 2.6
+  unbacked-symint support) — this is what makes the carry-all attention
+  graph lower exportable to a ``.pt2``. numpy/jax run it eagerly as before
+  (jax.jit would still need a static realization — deferred to the jax PR).
 - **shape-static** (``static_nnei`` set): assumes the center-major static
   layout (``E = n_center * static_nnei``, edge ``c * static_nnei + m`` belongs
   to center ``c`` — the layout ``from_dense_quartet(compact=False)`` emits).
   Pure arange/reshape arithmetic, ``P = n_center * static_nnei**2`` with all
   pairs materialized and validity carried by ``pair_mask`` — no data-dependent
-  ops, so it stays jit/export/make_fx-traceable.
+  ops at all, so it traces with only BACKED symbolic shapes (bit-parity with
+  the dense layout; used by the dense-quartet adapter).
 
 A global ``(E, E)`` same-center boolean is deliberately NOT used: with
 ``E ~ N * nnei`` it costs ``O(N**2 * nnei**2)`` memory.
@@ -37,6 +43,7 @@
 from deepmd.dpmodel.array_api import (
     Array,
     xp_add_at,
+    xp_hint_dynamic_size,
 )
 
 
@@ -131,13 +138,19 @@ def _pairs_compact(
         xp.zeros((0,), dtype=xp.int64, device=dev),
         xp.zeros((0,), dtype=xp.bool, device=dev),
     )
-    if dst.shape[0] == 0:
+    # early-return fast paths ONLY when the shape is a concrete Python int:
+    # under make_fx/torch.export symbolic tracing shape[0] is a SymInt (the
+    # nonzero output size is UNBACKED), and branching on it raises
+    # GuardOnDataDependentSymNode. The general code below handles the empty
+    # case correctly anyway (all downstream arrays come out empty).
+    if isinstance(dst.shape[0], int) and dst.shape[0] == 0:
         return empty
     # real edges only, grouped by center (stable sort keeps original order
     # within a center — irrelevant for correctness, deterministic for tests)
     (real_idx,) = xp.nonzero(edge_mask)
+    xp_hint_dynamic_size(real_idx)  # unbacked size R: register >= 0 size hint
     r_tot = real_idx.shape[0]
-    if r_tot == 0:
+    if isinstance(r_tot, int) and r_tot == 0:
         return empty
     d_real = xp.take(dst, real_idx, axis=0)
     order = xp.argsort(d_real, stable=True)
@@ -151,23 +164,34 @@ def _pairs_compact(
     csum = xp.cumulative_sum(counts)
     start = csum - counts  # (n_total,) group start per center
     deg = xp.take(counts, ds, axis=0)  # (R,) degree of each edge's center
+    # iota over the (unbacked-size) R and P axes via cumsum(ones) - 1 instead
+    # of xp.arange: the array_api_compat arange wrapper branches on the length
+    # in Python, which raises GuardOnDataDependentSymNode for unbacked SymInts.
+    iota_r = xp.cumulative_sum(ones) - 1  # (R,) = arange(r_tot)
     # each sorted edge t emits deg[t] pairs; P = sum(deg**2)
-    query_sorted = xp.repeat(xp.arange(r_tot, dtype=xp.int64, device=dev), deg)  # (P,)
+    query_sorted = xp.repeat(iota_r, deg)  # (P,)
+    xp_hint_dynamic_size(query_sorted)  # unbacked size P = sum(deg**2)
     # within each query's block, a 0..deg-1 ramp indexes the key group
     pair_off = xp.cumulative_sum(deg) - deg  # (R,) exclusive prefix of deg
     p_tot = query_sorted.shape[0]
-    ramp = xp.arange(p_tot, dtype=xp.int64, device=dev) - xp.take(
-        pair_off, query_sorted, axis=0
-    )
+    iota_p = xp.cumulative_sum(xp.ones_like(query_sorted)) - 1  # (P,) = arange(p_tot)
+    ramp = iota_p - xp.take(pair_off, query_sorted, axis=0)
     key_sorted = xp.take(start, xp.take(ds, query_sorted, axis=0), axis=0) + ramp
     query_edge = xp.take(eid, query_sorted, axis=0)
     key_edge = xp.take(eid, key_sorted, axis=0)
+    if include_self and ordered:
+        # no policy filter (the attention default): every enumerated pair is
+        # real. Skipping the compression nonzero here keeps the attention
+        # graph-lower traceable with a single unbacked size (P).
+        pair_mask = xp.ones((p_tot,), dtype=xp.bool, device=dev)
+        return query_edge, key_edge, pair_mask
     keep = xp.ones((p_tot,), dtype=xp.bool, device=dev)
     if not include_self:
         keep = keep & (query_edge != key_edge)
     if not ordered:
         keep = keep & (key_edge >= query_edge)
     (kept,) = xp.nonzero(keep)
+    xp_hint_dynamic_size(kept)  # unbacked size: policy-filtered pair count
     query_edge = xp.take(query_edge, kept, axis=0)
     key_edge = xp.take(key_edge, kept, axis=0)
     pair_mask = xp.ones((query_edge.shape[0],), dtype=xp.bool, device=dev)

From 397c40528fc31b46076ed117e1f8a186f6d3f267 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 3 Jul 2026 14:55:30 +0800
Subject: [PATCH 09/38] feat(pt_expt): graph-form .pt2 export for dpa1
 attention (attn_layer > 0)

With the compact pair enumeration unbacked-SymInt-traceable, the carry-all
attention graph lower now exports to a graph-form .pt2 unchanged in ABI
(same 5-tensor NeighborGraph schema, dynamic edge axis) and with carry-all
semantics preserved (no sel truncation, unlike the dense-adapter nlist-form
export). Update the stale freeze-gate message (attention is now eligible),
add a symbolic-trace merge gate at attn_layer in {0,2}, and parametrize the
DeepEval graph .pt2 fixture over attn_layer; both artifacts are checked
with dynamic sizes, PBC and non-PBC, to 1e-10 against the sel-capped dense
reference at non-binding sel. Also add a single-atom zero-real-edge runtime
test (the R==0 extreme of the unbacked sizes).
---
 deepmd/pt_expt/entrypoints/main.py            | 14 ++---
 .../pt_expt/infer/test_graph_deepeval.py      | 43 ++++++++++++---
 .../pt_expt/model/test_dpa1_graph_lower.py    | 53 +++++++++++++++++++
 3 files changed, 98 insertions(+), 12 deletions(-)

diff --git a/deepmd/pt_expt/entrypoints/main.py b/deepmd/pt_expt/entrypoints/main.py
index d21547fbef..465e76bc45 100644
--- a/deepmd/pt_expt/entrypoints/main.py
+++ b/deepmd/pt_expt/entrypoints/main.py
@@ -563,10 +563,12 @@ def freeze(
 
     m.eval()
 
-    # The graph lower is opt-in and only valid for graph-eligible models (dpa1
-    # attn_layer==0 today). Fail fast with a clear message rather than emitting a
-    # broken .pt2. Enable the per-atom virial for the graph form -- it is
-    # near-free there (one extra scatter off the single shared backward).
+    # The graph lower is opt-in and only valid for graph-eligible models
+    # (dpa1 with concat tebd and no type exclusion; attention layers included
+    # -- the carry-all pair enumeration exports via unbacked SymInts). Fail
+    # fast with a clear message rather than emitting a broken .pt2. Enable the
+    # per-atom virial for the graph form -- it is near-free there (one extra
+    # scatter off the single shared backward).
     do_atomic_virial = False
     if lower_kind == "graph":
         from deepmd.pt_expt.train.training import (
@@ -577,8 +579,8 @@ def freeze(
             raise ValueError(
                 "lower_kind='graph' requires a graph-eligible model "
                 "(mixed_types and a descriptor exposing uses_graph_lower()==True, "
-                "currently dpa1 with attn_layer==0). Use lower_kind='nlist' for "
-                "this model."
+                "currently dpa1 with tebd_input_mode='concat' and no "
+                "exclude_types). Use lower_kind='nlist' for this model."
             )
         do_atomic_virial = True
 
diff --git a/source/tests/pt_expt/infer/test_graph_deepeval.py b/source/tests/pt_expt/infer/test_graph_deepeval.py
index 4e7b929391..07649d9e77 100644
--- a/source/tests/pt_expt/infer/test_graph_deepeval.py
+++ b/source/tests/pt_expt/infer/test_graph_deepeval.py
@@ -153,11 +153,18 @@ def _eager_dense_reference(
     return {k: v.detach().cpu().numpy() for k, v in out.items()}
 
 
-@pytest.fixture(scope="module")
-def graph_pt2():
-    """Build a dpa1(attn_layer=0) model and export it to a graph-form ``.pt2``.
-
-    The AOTI compile is slow (~90 s), so it is done once per module.  The eager
+@pytest.fixture(scope="module", params=[0, 2], ids=["attn0", "attn2"])
+def graph_pt2(request):
+    """Build a dpa1 model and export it to a graph-form ``.pt2``.
+
+    Parametrized over ``attn_layer``: 0 exercises the factorizable graph lower;
+    2 exercises the carry-all ATTENTION graph lower, whose compact pair
+    enumeration exports via unbacked SymInts (``xp_hint_dynamic_size``).
+    ``smooth_type_embedding`` stays False: the smooth dense reference keeps
+    sel-padding in its softmax denominator, so dense==carry-all parity holds
+    only for the non-smooth branch (PR-D divergence decision).
+
+    The AOTI compile is slow (~90 s), so it is done once per param.  The eager
     pt_expt model is returned alongside the archive path to serve as the dense
     parity reference.
     """
@@ -165,7 +172,10 @@ def graph_pt2():
         get_model,
     )
 
-    model = get_model(copy.deepcopy(DPA1_CONFIG)).to(torch.float64)
+    config = copy.deepcopy(DPA1_CONFIG)
+    config["descriptor"]["attn_layer"] = request.param
+    config["descriptor"]["smooth_type_embedding"] = False
+    model = get_model(config).to(torch.float64)
     model.eval()
     data = {"model": model.serialize()}
 
@@ -264,3 +274,24 @@ def test_graph_pt2_deepeval_vesin_matches_dense(graph_pt2, pbc) -> None:
     np.testing.assert_allclose(e_v, e_d, rtol=1e-10, atol=1e-10, err_msg="energy")
     np.testing.assert_allclose(f_v, f_d, rtol=1e-10, atol=1e-10, err_msg="force")
     np.testing.assert_allclose(v_v, v_d, rtol=1e-10, atol=1e-10, err_msg="virial")
+
+
+def test_graph_pt2_single_atom_no_edges(graph_pt2) -> None:
+    """A single isolated atom (zero real edges) evaluates through the ``.pt2``.
+
+    The graph builder emits only masked guard edges here, so at runtime the
+    compact pair enumeration sees ``R == 0`` real edges — the empty extreme of
+    the unbacked-SymInt sizes the attention export carries.  Energy must match
+    the eager dense reference and the force must be (numerically) zero.
+    """
+    pt2_path, model = graph_pt2
+    coords = np.array([[[9.0, 9.0, 9.0]]])
+    atype = np.array([0], dtype=np.int32)
+
+    dp = DeepPot(pt2_path)
+    e, f, v = dp.eval(coords, None, atype)[:3]
+    ref = _eager_dense_reference(model, coords, None, atype)
+    np.testing.assert_allclose(
+        e.reshape(-1), ref["energy"].reshape(-1), rtol=1e-10, atol=1e-10
+    )
+    np.testing.assert_allclose(f.reshape(-1), 0.0, atol=1e-12)
diff --git a/source/tests/pt_expt/model/test_dpa1_graph_lower.py b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
index d6cbb51f2f..cf221a3199 100644
--- a/source/tests/pt_expt/model/test_dpa1_graph_lower.py
+++ b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
@@ -237,3 +237,56 @@ def test_force_virial_parity_vs_legacy(self, periodic, do_av, attn_layer) -> Non
             )
             graph_av = graph["energy_derv_c"].reshape(nf, nloc, 1, 9)
             torch.testing.assert_close(graph_av, legacy_av_local, **tol)
+
+    @pytest.mark.parametrize("attn_layer", [0, 2])  # factorizable AND attention
+    def test_graph_lower_symbolic_trace(self, attn_layer) -> None:
+        """``forward_lower_graph_exportable`` traces symbolically for BOTH the
+        factorizable (attn_layer=0) and attention (attn_layer=2) graph lowers,
+        and the traced module reproduces the eager graph lower bit-tight.
+
+        attn_layer > 0 exercises the carry-all compact pair enumeration
+        (``center_edge_pairs`` with ``static_nnei=None``) under make_fx
+        symbolic tracing: its ``nonzero``/tensor-``repeat`` output sizes are
+        UNBACKED SymInts, registered via ``xp_hint_dynamic_size`` — the
+        mechanism that makes the attention graph lower ``.pt2``-exportable.
+        """
+        from deepmd.pt_expt.utils.serialization import (
+            build_synthetic_graph_inputs,
+        )
+
+        model = self._make_model(attn_layer=attn_layer)
+        model.eval()
+        sample = build_synthetic_graph_inputs(
+            model,
+            e_max=175,
+            nframes=2,
+            nloc=7,
+            dtype=torch.float64,
+            device=torch.device("cpu"),
+        )
+        atype, n_node, ei, ev, em, fp, ap, cs = sample
+        traced = model.forward_lower_graph_exportable(
+            atype,
+            n_node,
+            ei,
+            ev,
+            em,
+            fparam=fp,
+            aparam=ap,
+            do_atomic_virial=True,
+            charge_spin=cs,
+            tracing_mode="symbolic",
+            _allow_non_fake_inputs=True,
+        )
+        out = traced(atype, n_node, ei, ev, em, fp, ap, cs)
+        ref = model.forward_common_lower_graph(
+            atype, n_node, ei, ev, em, fparam=fp, aparam=ap, do_atomic_virial=True
+        )
+        tol = {"rtol": 1e-12, "atol": 1e-12}
+        torch.testing.assert_close(out["energy"], ref["energy_redu"], **tol)
+        torch.testing.assert_close(
+            out["force"], ref["energy_derv_r"].reshape(out["force"].shape), **tol
+        )
+        torch.testing.assert_close(
+            out["virial"], ref["energy_derv_c_redu"].reshape(out["virial"].shape), **tol
+        )

From 67d140d0d1f218d4a2e0390d3da82245fa51ddb4 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 3 Jul 2026 15:59:00 +0800
Subject: [PATCH 10/38] fix(dpmodel): cast edge_vec to descriptor precision in
 dpa1 call_graph

The dense call is wrapped in @cast_precision, but the graph route's only
float input (edge_vec) lives inside the NeighborGraph dataclass where the
decorator cannot see it, so non-global-precision models (e.g. float32)
crashed with a double-vs-float matmul on the graph route while the dense
route worked. Cast edge_vec down to the descriptor precision on entry and
the outputs back to the caller's dtype on exit (differentiable, so the
model-level force autograd is unaffected). Add an fp32 graph-vs-dense route
parity test at attn_layer 0 and 2.
---
 deepmd/dpmodel/descriptor/dpa1.py             | 15 +++++
 .../pt_expt/model/test_dpa1_graph_lower.py    | 57 +++++++++++++++++++
 2 files changed, 72 insertions(+)

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index b498d3d296..98259a5731 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -26,6 +26,7 @@
 )
 from deepmd.dpmodel.common import (
     cast_precision,
+    get_xp_precision,
     to_numpy_array,
 )
 from deepmd.dpmodel.utils import (
@@ -760,8 +761,19 @@ def call_graph(
             (N, ng, 3) equivariant single-particle representation, flat node
             axis.
         """
+        import dataclasses
+
         xp = array_api_compat.array_namespace(graph.edge_vec)
         dev = array_api_compat.device(graph.edge_vec)
+        # manual @cast_precision: the decorator casts array ARGUMENTS, but the
+        # graph's only float input (edge_vec) is inside the NeighborGraph
+        # dataclass, invisible to it. Cast edge_vec down to the descriptor
+        # precision on entry and the outputs back to the caller's dtype on
+        # exit (differentiable: grad still flows to the caller's edge_vec leaf).
+        in_dtype = graph.edge_vec.dtype
+        prec = get_xp_precision(xp, self.precision)
+        if in_dtype != prec:
+            graph = dataclasses.replace(graph, edge_vec=xp.astype(graph.edge_vec, prec))
         grrg, rot_mat = self.se_atten.call_graph(
             graph, atype, type_embedding=type_embedding, static_nnei=static_nnei
         )
@@ -775,6 +787,9 @@ def call_graph(
             atype_local = xp.asarray(atype, device=dev)
             atype_embd = xp.take(type_embedding, atype_local, axis=0)  # (N, tebd_dim)
             grrg = xp.concat([grrg, atype_embd], axis=-1)
+        if in_dtype != prec:
+            grrg = xp.astype(grrg, in_dtype)
+            rot_mat = xp.astype(rot_mat, in_dtype)
         return grrg, rot_mat
 
     def enable_compression(
diff --git a/source/tests/pt_expt/model/test_dpa1_graph_lower.py b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
index cf221a3199..5dc98b4784 100644
--- a/source/tests/pt_expt/model/test_dpa1_graph_lower.py
+++ b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
@@ -290,3 +290,60 @@ def test_graph_lower_symbolic_trace(self, attn_layer) -> None:
         torch.testing.assert_close(
             out["virial"], ref["energy_derv_c_redu"].reshape(out["virial"].shape), **tol
         )
+
+    @pytest.mark.parametrize("attn_layer", [0, 2])  # factorizable AND attention
+    def test_graph_route_float32(self, attn_layer) -> None:
+        """A float32 model runs the graph route and matches the dense route.
+
+        The descriptor-level ``call_graph`` casts ``edge_vec`` to the
+        descriptor precision manually (``@cast_precision`` cannot see inside
+        the NeighborGraph dataclass); without it, fp32 models crash with a
+        double-vs-float matmul on the graph route while the dense route works.
+        fp32 accumulation-order differences bound the tolerance (1e-6/1e-5),
+        per the fp32-computation guidance.
+        """
+        from deepmd.pt_expt.descriptor.dpa1 import DescrptDPA1 as _D
+        from deepmd.pt_expt.fitting import InvarFitting as _F
+
+        ds = _D(
+            self.rcut,
+            self.rcut_smth,
+            self.sel,
+            self.nt,
+            neuron=[3, 6],
+            axis_neuron=2,
+            attn=4,
+            attn_layer=attn_layer,
+            attn_dotr=True,
+            smooth_type_embedding=False,
+            precision="float32",
+            seed=GLOBAL_SEED,
+        ).to(self.device)
+        ft = _F(
+            "energy",
+            self.nt,
+            ds.get_dim_out(),
+            1,
+            mixed_types=True,
+            precision="float32",
+            seed=GLOBAL_SEED,
+        ).to(self.device)
+        model = EnergyModel(ds, ft, type_map=self.type_map).to(self.device)
+        model.eval()
+        graph = model.call_common(
+            self.coord.clone().requires_grad_(True),
+            self.atype,
+            self.cell.reshape(1, 9),
+            neighbor_graph_method="dense",
+        )
+        dense = model.call_common(
+            self.coord.clone().requires_grad_(True),
+            self.atype,
+            self.cell.reshape(1, 9),
+            neighbor_graph_method="legacy",
+        )
+        tol = {"rtol": 1e-5, "atol": 1e-6}
+        torch.testing.assert_close(graph["energy_redu"], dense["energy_redu"], **tol)
+        torch.testing.assert_close(
+            graph["energy_derv_r"], dense["energy_derv_r"], **tol
+        )

From bf09b85e7bef14b98a3c6d2370da2a9341c00838 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 3 Jul 2026 17:20:55 +0800
Subject: [PATCH 11/38] fix(dpmodel): shift masked values in segment_softmax to
 prevent segment-wide NaN

A masked entry whose raw logit exceeds the unmasked per-segment max by more
than the exp overflow threshold (~709 fp64 / ~88 fp32) overflowed exp() to
inf, and the post-hoc inf * 0 mask multiply produced nan, which the
denominator sum then spread across the entire segment. Shift data_for_max
(masked entries already -inf, exp(-inf) == 0 exactly) instead of the raw
data; the mask multiply stays as a defensive no-op. Regression test with a
masked logit 1e5 above the unmasked max. Addresses CodeRabbit review.
---
 deepmd/dpmodel/utils/neighbor_graph/segment.py |  9 ++++++++-
 .../common/dpmodel/test_segment_softmax.py     | 18 ++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/deepmd/dpmodel/utils/neighbor_graph/segment.py b/deepmd/dpmodel/utils/neighbor_graph/segment.py
index 6f6d946f77..f95671d05c 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/segment.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/segment.py
@@ -79,9 +79,16 @@ def segment_softmax(
     seg_max = segment_max(data_for_max, segment_ids, num_segments)
     # guard -inf (empty / fully-masked segments) so gather doesn't yield inf-inf
     seg_max = xp.where(xp.isinf(seg_max), xp.zeros_like(seg_max), seg_max)
-    shifted = data - xp.take(seg_max, segment_ids, axis=0)
+    # shift data_for_max (masked entries already -inf), NOT the raw data:
+    # a masked entry whose raw value exceeds the unmasked per-segment max by
+    # more than the exp overflow threshold (~709 fp64 / ~88 fp32) would give
+    # exp(+big) = inf, and the post-hoc inf * 0 mask multiply = nan, poisoning
+    # the WHOLE segment through the denominator. exp(-inf) = 0 exactly.
+    shifted = data_for_max - xp.take(seg_max, segment_ids, axis=0)
     ex = xp.exp(shifted)
     if mask is not None:
+        # defensive no-op after the -inf shift (exp(-inf) == 0); kept so the
+        # zero-weight guarantee never depends on the shift implementation
         ex = ex * xp.astype(mask, ex.dtype)
     denom = segment_sum(ex, segment_ids, num_segments)
     denom_e = xp.take(denom, segment_ids, axis=0)
diff --git a/source/tests/common/dpmodel/test_segment_softmax.py b/source/tests/common/dpmodel/test_segment_softmax.py
index b34ee8efaf..a97bd9c1aa 100644
--- a/source/tests/common/dpmodel/test_segment_softmax.py
+++ b/source/tests/common/dpmodel/test_segment_softmax.py
@@ -92,3 +92,21 @@ def test_torch_matches_numpy(self) -> None:
             mask=torch.from_numpy(mask),
         )
         np.testing.assert_allclose(out.numpy(), ref, atol=1e-12)
+
+
+def test_masked_entry_larger_than_unmasked_max_no_nan() -> None:
+    """A masked entry FAR ABOVE the unmasked max must not poison the segment.
+
+    Regression (CodeRabbit #5715): shifting the raw data let a huge masked
+    logit overflow exp() to inf, and inf * 0 (mask multiply) = nan summed into
+    the denominator, contaminating every entry of the segment. The shift must
+    use the masked (-inf) values so masked entries exp() to exactly zero.
+    """
+    data = np.array([1.0, 2.0, 1e5], dtype=np.float64)  # 1e5 - 2 >> 709
+    ids = np.zeros(3, dtype=np.int64)
+    mask = np.array([True, True, False])
+    out = segment_softmax(data, ids, 1, mask=mask)
+    assert np.all(np.isfinite(out))
+    ref = np.exp([1.0, 2.0]) / np.exp([1.0, 2.0]).sum()
+    np.testing.assert_allclose(out[:2], ref, rtol=1e-12)
+    assert out[2] == 0.0

From 7bf56b6bb5c384feddf8baa2d721810e476d67ce Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sat, 4 Jul 2026 09:15:14 +0800
Subject: [PATCH 12/38] docs(pt_expt): document sel-independent smooth graph
 attention; refresh eligibility wording

Address OutisLi review on #5715:
- Notes on DescrptDPA1.call_graph (+ pointers on uses_graph_lower and the
  freeze lower_kind docstring): for smooth_type_embedding=True the carry-all
  graph attention intentionally drops the dense layout's sel-padding terms
  from the softmax denominator - sel-independent semantics that differ from
  the legacy dense lower by up to ~1e-4; bit-tight dense parity holds for the
  non-smooth branch and for the static_nnei dense-adapter realization.
- Update the stale 'dpa1 attn_layer == 0' eligibility wording (freeze()
  docstring, training-path predicate/comment, dpmodel/pt_expt graph-lower
  docstrings) to the actual contract: mixed-types dpa1/se_atten with concat
  type embedding and no exclude_types, attention layers included; call_common
  docs no longer imply unconditional dense parity.
---
 deepmd/dpmodel/descriptor/dpa1.py  | 21 +++++++++++++++++++++
 deepmd/dpmodel/model/make_model.py | 14 +++++++++-----
 deepmd/pt_expt/entrypoints/main.py | 14 ++++++++++----
 deepmd/pt_expt/model/make_model.py |  2 +-
 deepmd/pt_expt/train/training.py   |  5 +++--
 5 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index 98259a5731..30a2d25e38 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -434,6 +434,12 @@ def uses_graph_lower(self) -> bool:
         with concat type-embedding and no type exclusion. Remaining ineligible
         configs (``tebd_input_mode == "strip"``, ``exclude_types``) fall back
         to the legacy dense path, so those models keep working unchanged.
+
+        Eligibility does NOT imply numerical interchangeability with the
+        dense route for every config: with ``smooth_type_embedding=True``
+        the carry-all graph attention is sel-independent by design and
+        differs from the dense lower by up to ~1e-4 (see the Notes of
+        :meth:`call_graph`).
         """
         return (
             self.se_atten.tebd_input_mode == "concat"
@@ -744,6 +750,21 @@ def call_graph(
         not produce the dense ``sw`` (that lives in the dense :meth:`call`
         adapter, which has the ``nlist``/``coord_ext`` needed to build it).
 
+        Notes
+        -----
+        **Smooth attention is intentionally sel-independent on the graph
+        path.** For ``smooth_type_embedding=True`` the legacy dense attention
+        keeps the sel-padding slots in its softmax DENOMINATOR (phantom
+        ``exp(-attnw_shift)`` terms), which makes dense output depend on the
+        ``sel`` setting by up to ~1e-4 even for identical physical neighbors.
+        A carry-all graph has no padding slots, so its softmax runs over the
+        real neighbor pairs only: cleaner, sel-independent semantics that
+        deliberately DIFFER from the dense route for smooth models. The two
+        routes agree bit-tight only for ``smooth_type_embedding=False`` (at
+        non-binding ``sel``), or when this kernel is realized on a dense
+        layout via ``static_nnei`` (the dense :meth:`call` adapter), which
+        reproduces the phantom terms for exact backward compatibility.
+
         Parameters
         ----------
         graph
diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index b3ce544377..be52bc9f22 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -313,10 +313,14 @@ def call_common(
 
                 The graph routes (``"dense"``/``"ase"``, and the pt_expt
                 default-flip) require a ``mixed_types`` descriptor with a graph
-                lower (dpa1 ``attn_layer == 0``).  At non-binding ``sel`` the
-                graph matches the dense path exactly; at binding ``sel`` the
-                carry-all graph keeps neighbors the dense path truncates, so the
-                energy intentionally differs.
+                lower (dpa1/se_atten with concat type embedding and no
+                ``exclude_types``; attention layers included).  At non-binding
+                ``sel`` the graph matches the dense path exactly for the
+                non-smooth branch; at binding ``sel`` the carry-all graph keeps
+                neighbors the dense path truncates, and for
+                ``smooth_type_embedding=True`` the graph drops the dense
+                layout's sel-padding softmax terms, so the energy intentionally
+                differs (sel-independent graph semantics).
 
             Returns
             -------
@@ -688,7 +692,7 @@ def call_common_lower_graph(
             comm_dict: dict | None = None,
             charge_spin: Array | None = None,
         ) -> dict[str, Array]:
-            """Graph-native PUBLIC lower (PR-A: dpa1 ``attn_layer == 0``).
+            """Graph-native PUBLIC lower (dpa1/se_atten concat-tebd, attention included).
 
             The PRIMARY directly-callable graph interface (spec decision #14).
             Casts inputs/outputs to/from the model precision exactly like the
diff --git a/deepmd/pt_expt/entrypoints/main.py b/deepmd/pt_expt/entrypoints/main.py
index 465e76bc45..eeb97dcd2c 100644
--- a/deepmd/pt_expt/entrypoints/main.py
+++ b/deepmd/pt_expt/entrypoints/main.py
@@ -498,10 +498,16 @@ def freeze(
     lower_kind : str
         Lower-level export form: ``"nlist"`` (default, dense neighbor-list lower)
         or ``"graph"`` (NeighborGraph edge-list lower). ``"graph"`` is only valid
-        for graph-eligible models (``mixed_types`` and ``uses_graph_lower``,
-        currently dpa1 with ``attn_layer == 0``) and selects the C++ graph
-        inference path; the per-atom virial is enabled for it (near-free in the
-        graph path: one extra scatter off the shared single backward).
+        for graph-eligible models (``mixed_types`` and ``uses_graph_lower``:
+        dpa1/se_atten with concat type embedding and no ``exclude_types``,
+        attention layers included) and selects the C++ graph inference path;
+        the per-atom virial is enabled for it (near-free in the graph path:
+        one extra scatter off the shared single backward). NOTE: for
+        ``smooth_type_embedding=True`` the carry-all graph attention
+        intentionally drops the dense layout's sel-padding terms from the
+        softmax denominator, so graph-form results are sel-independent and
+        differ from the legacy dense lower by up to ~1e-4 (see
+        ``DescrptDPA1.call_graph``).
     """
     import torch
 
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index 928149ca94..ae2e83eada 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -292,7 +292,7 @@ def forward_common_lower_graph(
             aparam: torch.Tensor | None = None,
             charge_spin: torch.Tensor | None = None,
         ) -> dict[str, torch.Tensor]:
-            """Graph-native lower with autograd force/virial (PR-A: dpa1 ``attn_layer==0``).
+            """Graph-native lower with autograd force/virial (dpa1/se_atten concat-tebd, attention included).
 
             OUTPUT-AGNOSTIC: runs the graph descriptor + fitting forward with
             ``edge_vec`` as the autograd leaf (via the inherited
diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 1ae51fd483..ad00e0b947 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -587,7 +587,8 @@ def _model_uses_graph_lower(model: torch.nn.Module) -> bool:
     :meth:`~deepmd.pt_expt.model.make_model.make_model.<locals>.CM._resolve_graph_method`
     for ``neighbor_graph_method is None`` (the training default): a model is
     graph-eligible iff it is ``mixed_types`` AND its single descriptor reports
-    ``uses_graph_lower() == True`` (currently only dpa1 ``attn_layer == 0``).
+    ``uses_graph_lower() == True`` (dpa1/se_atten with concat type embedding
+    and no ``exclude_types``; attention layers included).
 
     When True the compiled lower must be the GRAPH ``forward_common_lower_graph``
     so the compiled path matches eager training (which already default-flips to
@@ -906,7 +907,7 @@ def forward(
         nframes, nloc = atype.shape[:2]
         rcut = self.original_model.get_rcut()
 
-        # Graph-eligible models (dpa1 attn_layer==0) default-flip to the carry-all
+        # Graph-eligible models (dpa1 concat-tebd, incl. attention) default-flip to the carry-all
         # GRAPH forward in eager training; the compiled lower must be the GRAPH
         # lower too, otherwise the eager (graph) and compiled (dense) backward
         # gradients diverge at fp64 accumulation and the optimizer amplifies it.

From d49b9cb4388a5e3f7d14a0a0b600d53dc5f8d5ca Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sat, 4 Jul 2026 18:25:12 +0800
Subject: [PATCH 13/38] fix(pt_expt): fail fast on torch < 2.6 for graph
 attention tracing

---
 deepmd/pt_expt/train/training.py              |  2 +
 deepmd/pt_expt/utils/serialization.py         | 32 +++++++++
 .../pt_expt/utils/test_graph_pt2_metadata.py  | 71 +++++++++++++++++++
 3 files changed, 105 insertions(+)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index ad00e0b947..d2868a4082 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -722,8 +722,10 @@ def _trace_and_compile_graph(
     # float precision and device; optional tensors match the actual call.
     from deepmd.pt_expt.utils.serialization import (
         build_synthetic_graph_inputs,
+        check_graph_trace_torch_version,
     )
 
+    check_graph_trace_torch_version(model)
     sample = build_synthetic_graph_inputs(
         model,
         e_max=e_max,
diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py
index b9d1531769..2af7aa0fa8 100644
--- a/deepmd/pt_expt/utils/serialization.py
+++ b/deepmd/pt_expt/utils/serialization.py
@@ -141,6 +141,37 @@ def _needs_with_comm_artifact(model: torch.nn.Module) -> bool:
         return False
 
 
+def check_graph_trace_torch_version(model: torch.nn.Module) -> None:
+    """Fail fast when the graph trace needs unbacked-SymInt support torch lacks.
+
+    The compact ``center_edge_pairs`` realization used by graph attention
+    (``attn_layer > 0``) relies on unbacked-SymInt tracing
+    (``torch._check_is_size`` hints on ``nonzero`` / tensor-``repeat`` outputs,
+    see ``deepmd/dpmodel/utils/neighbor_graph/pairs.py``), which is only solid
+    from torch >= 2.6. On older torch the trace dies deep inside
+    ``make_fx``/AOTI with an obscure ``GuardOnDataDependentSymNode`` (or an
+    ``AttributeError`` on ``_check_is_size``), so both graph trace sites (the
+    ``.pt2`` export below and the training compile in
+    ``training._trace_and_compile_graph``) call this guard first. Factorizable
+    models (``attn_layer == 0``) trace with backed symbols only and are not
+    restricted.
+    """
+    desc = getattr(getattr(model, "atomic_model", None), "descriptor", None)
+    get_n_attn = getattr(desc, "get_numb_attn_layer", None)
+    n_attn = get_n_attn() if get_n_attn is not None else 0
+    if n_attn <= 0:
+        return
+    version = torch.__version__.split("+")[0]
+    major_minor = tuple(int(p) for p in version.split(".")[:2] if p.isdigit())
+    if len(major_minor) == 2 and major_minor < (2, 6):
+        raise RuntimeError(
+            f"graph-form tracing of attention layers (attn_layer={n_attn}) "
+            f"requires torch >= 2.6 (unbacked-SymInt support for the compact "
+            f"center_edge_pairs realization); found torch {torch.__version__}. "
+            "Upgrade torch, set 'attn_layer: 0', or use the dense (nlist) path."
+        )
+
+
 # Module-level cache for the trace-time sendlist buffer. The pointer
 # value embedded in ``send_list_tensor`` references this numpy array's
 # data; the array must outlive the trace + export call.  Caching here
@@ -889,6 +920,7 @@ def _trace_and_export(
     if lower_kind == "graph":
         import math
 
+        check_graph_trace_torch_version(model)
         if is_spin:
             raise NotImplementedError(
                 "graph-form .pt2 export is not supported for spin models"
diff --git a/source/tests/pt_expt/utils/test_graph_pt2_metadata.py b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py
index a541f744cc..54aa9f688d 100644
--- a/source/tests/pt_expt/utils/test_graph_pt2_metadata.py
+++ b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py
@@ -132,3 +132,74 @@ def test_neighbor_graph_method_rejected_on_nlist_artifact(dpa1_dpmodel_data) ->
             DeepPot(p, neighbor_graph_method="vesin")
         # the default stays accepted (no behavior change)
         DeepPot(p)
+
+
+class _FakeDesc:
+    def __init__(self, n_attn: int) -> None:
+        self._n = n_attn
+
+    def get_numb_attn_layer(self) -> int:
+        return self._n
+
+
+class _FakeAtomicModel:
+    def __init__(self, n_attn: int) -> None:
+        self.descriptor = _FakeDesc(n_attn)
+
+
+class _FakeModel:
+    def __init__(self, n_attn: int) -> None:
+        self.atomic_model = _FakeAtomicModel(n_attn)
+
+
+@pytest.mark.parametrize(
+    "version", ["2.5.1", "2.5.1+cu124"]
+)  # torch below the 2.6 floor
+def test_graph_trace_version_guard_rejects_attention_on_old_torch(
+    monkeypatch, version
+) -> None:
+    """attn_layer > 0 on torch < 2.6 fails fast with a clear message."""
+    import torch
+
+    from deepmd.pt_expt.utils.serialization import (
+        check_graph_trace_torch_version,
+    )
+
+    monkeypatch.setattr(torch, "__version__", version)
+    with pytest.raises(RuntimeError, match=r"torch >= 2\.6"):
+        check_graph_trace_torch_version(_FakeModel(2))
+
+
+@pytest.mark.parametrize(
+    ("version", "n_attn"),
+    [
+        ("2.5.1", 0),  # old torch OK without attention (backed symbols only)
+        ("2.6.0", 2),  # floor version with attention
+        ("2.10.0+cu126", 2),  # current torch with attention, local suffix
+    ],
+)
+def test_graph_trace_version_guard_passes(monkeypatch, version, n_attn) -> None:
+    """No-attention models and torch >= 2.6 pass the guard silently."""
+    import torch
+
+    from deepmd.pt_expt.utils.serialization import (
+        check_graph_trace_torch_version,
+    )
+
+    monkeypatch.setattr(torch, "__version__", version)
+    check_graph_trace_torch_version(_FakeModel(n_attn))
+
+
+def test_graph_trace_version_guard_tolerates_no_descriptor(monkeypatch) -> None:
+    """Composite models without a single descriptor pass (dense route anyway)."""
+    import torch
+
+    from deepmd.pt_expt.utils.serialization import (
+        check_graph_trace_torch_version,
+    )
+
+    class _NoDesc:
+        pass
+
+    monkeypatch.setattr(torch, "__version__", "2.5.1")
+    check_graph_trace_torch_version(_NoDesc())

From 7c65935aa8b221b2685db18d05dc7f3de45ac54c Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sat, 4 Jul 2026 18:28:17 +0800
Subject: [PATCH 14/38] docs(pt_expt): numpydoc sections for
 check_graph_trace_torch_version

---
 deepmd/pt_expt/utils/serialization.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py
index 2af7aa0fa8..9317e87f4d 100644
--- a/deepmd/pt_expt/utils/serialization.py
+++ b/deepmd/pt_expt/utils/serialization.py
@@ -155,6 +155,20 @@ def check_graph_trace_torch_version(model: torch.nn.Module) -> None:
     ``training._trace_and_compile_graph``) call this guard first. Factorizable
     models (``attn_layer == 0``) trace with backed symbols only and are not
     restricted.
+
+    Parameters
+    ----------
+    model
+        The graph-eligible model about to be traced. The attention depth is
+        read from ``model.atomic_model.descriptor.get_numb_attn_layer()``;
+        models without a single descriptor (linear/zbl/frozen) pass the
+        check (they take the dense route anyway).
+
+    Raises
+    ------
+    RuntimeError
+        If the descriptor has ``attn_layer > 0`` and the running torch is
+        older than 2.6.
     """
     desc = getattr(getattr(model, "atomic_model", None), "descriptor", None)
     get_n_attn = getattr(desc, "get_numb_attn_layer", None)

From 6fc45bd26e3919601abeb27449a17d3a6717816d Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sat, 4 Jul 2026 19:12:13 +0800
Subject: [PATCH 15/38] test+docs: pin smooth-attention graph-vs-dense
 divergence; document the pt_expt graph default and legacy escape hatch

---
 doc/model/train-se-atten.md                   |  5 +++
 .../pt_expt/model/test_dpa1_graph_lower.py    | 34 +++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/doc/model/train-se-atten.md b/doc/model/train-se-atten.md
index e504207ac2..177c652bed 100644
--- a/doc/model/train-se-atten.md
+++ b/doc/model/train-se-atten.md
@@ -157,6 +157,11 @@ In other backends, type embedding is within this descriptor with the {ref}`tebd_
 TensorFlow and other backends have different implementations for {ref}`smooth_type_embedding <model[standard]/descriptor[se_atten_v2]/smooth_type_embedding>`.
 The results are inconsistent when `smooth_type_embedding` is `true`.
 
+In the pt_expt backend, graph-eligible descriptors (mixed types, `tebd_input_mode` `"concat"`, no descriptor-level `exclude_types` or compression) are evaluated by default through the carry-all neighbor-graph path instead of the legacy dense neighbor list.
+The graph path considers all neighbors within the cutoff, so its result does not depend on {ref}`sel <model[standard]/descriptor[se_atten]/sel>`.
+When `smooth_type_embedding` is `true` and {ref}`attn_layer <model[standard]/descriptor[se_atten]/attn_layer>` is larger than 0 (the defaults), the dense path keeps `sel`-padding phantom terms in the attention softmax denominator while the graph path drops them, so checkpoints trained under the dense semantics shift by up to about 1e-4 in energy when evaluated on the graph path.
+Passing `neighbor_graph_method="legacy"` to the model forward (or the corresponding evaluation option) restores the dense-path numbers exactly.
+
 In the TensorFlow backend, {ref}`scaling_factor <model[standard]/descriptor[se_atten]/scaling_factor>` cannot set to a value other than `1.0`;
 {ref}`normalize <model[standard]/descriptor[se_atten]/normalize>` cannot be set to `false`;
 {ref}`temperature <model[standard]/descriptor[se_atten]/temperature>` cannot be set;
diff --git a/source/tests/pt_expt/model/test_dpa1_graph_lower.py b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
index 5dc98b4784..135c0e04f3 100644
--- a/source/tests/pt_expt/model/test_dpa1_graph_lower.py
+++ b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
@@ -291,6 +291,40 @@ def test_graph_lower_symbolic_trace(self, attn_layer) -> None:
             out["virial"], ref["energy_derv_c_redu"].reshape(out["virial"].shape), **tol
         )
 
+    def test_smooth_attention_divergence_pinned(self) -> None:
+        """End-to-end: the pt_expt DEFAULT route (carry-all graph) diverges
+        from the dense route for ``smooth_type_embedding=True`` + attention —
+        nonzero and bounded by the documented ~1e-4 magnitude.
+
+        The carry-all graph drops sel-padding phantom terms from the smooth
+        attention softmax denominator BY DESIGN (NeighborGraph PR-D), while
+        the dense path keeps them, so dense output is sel-dependent.  This
+        test pins that divergence at the public model forward so a future
+        refactor cannot silently change the carry-all smooth semantics.
+        ``neighbor_graph_method="legacy"`` is the escape hatch restoring the
+        dense numbers; the parity tests above cover the smooth=False regime
+        where the two routes agree bit-tight.
+        """
+        model = self._make_model(attn_layer=2, smooth=True)
+        model.eval()
+        coord = self.coord.clone().requires_grad_(True)
+        box = self.cell.reshape(1, 9)
+        # None = the default flip: graph-eligible mixed_types -> carry-all graph
+        graph = model.call_common(coord, self.atype, box, neighbor_graph_method=None)
+        dense = model.call_common(
+            self.coord.clone().requires_grad_(True),
+            self.atype,
+            box,
+            neighbor_graph_method="legacy",
+        )
+        e_diff = (graph["energy_redu"] - dense["energy_redu"]).abs().max().item()
+        f_diff = (graph["energy_derv_r"] - dense["energy_derv_r"]).abs().max().item()
+        # nonzero: well above fp64 accumulation noise of a bit-tight parity
+        assert e_diff > 1e-10, f"expected smooth divergence, got {e_diff:.3e}"
+        # bounded: the documented magnitude is ~1e-4; 1e-3 leaves headroom
+        assert e_diff < 1e-3, f"smooth divergence too large: {e_diff:.3e}"
+        assert f_diff < 1e-3, f"smooth force divergence too large: {f_diff:.3e}"
+
     @pytest.mark.parametrize("attn_layer", [0, 2])  # factorizable AND attention
     def test_graph_route_float32(self, attn_layer) -> None:
         """A float32 model runs the graph route and matches the dense route.

From ce98ea5b43bd03eccb5c17a9c637054d8299c5b7 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 00:38:07 +0800
Subject: [PATCH 16/38] feat(dpmodel): canonical apply_pair_exclusion graph
 transform (decision #18)

---
 .../dpmodel/utils/neighbor_graph/__init__.py  |   2 +
 deepmd/dpmodel/utils/neighbor_graph/graph.py  |  65 +++++++
 .../dpmodel/test_apply_pair_exclusion.py      | 168 ++++++++++++++++++
 3 files changed, 235 insertions(+)
 create mode 100644 source/tests/common/dpmodel/test_apply_pair_exclusion.py

diff --git a/deepmd/dpmodel/utils/neighbor_graph/__init__.py b/deepmd/dpmodel/utils/neighbor_graph/__init__.py
index 24fb090309..5cb84d4e9d 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/__init__.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/__init__.py
@@ -28,6 +28,7 @@
 from .graph import (
     GraphLayout,
     NeighborGraph,
+    apply_pair_exclusion,
     frame_id_from_n_node,
     node_validity_mask,
     pad_and_guard_edges,
@@ -45,6 +46,7 @@
 __all__ = [
     "GraphLayout",
     "NeighborGraph",
+    "apply_pair_exclusion",
     "build_neighbor_graph",
     "build_neighbor_graph_ase",
     "center_edge_pairs",
diff --git a/deepmd/dpmodel/utils/neighbor_graph/graph.py b/deepmd/dpmodel/utils/neighbor_graph/graph.py
index 0ce10efdf6..20a12be6d7 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/graph.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/graph.py
@@ -25,6 +25,9 @@
     from deepmd.dpmodel.array_api import (
         Array,
     )
+    from deepmd.dpmodel.utils.exclude_mask import (
+        PairExcludeMask,
+    )
 
 
 @dataclass
@@ -167,6 +170,68 @@ def frame_id_from_n_node(n_node: Array, n_total: int | None = None) -> Array:
     return xp.minimum(frame_id, xp.astype(last_frame, xp.int64))
 
 
+def apply_pair_exclusion(
+    graph: NeighborGraph,
+    atype: Array,
+    pair_excl: PairExcludeMask | None,
+    *,
+    compact: bool = False,
+) -> NeighborGraph:
+    """Canonical pair-type exclusion transform (decision #18).
+
+    ANDs the per-edge type keep-mask into ``graph.edge_mask`` so excluded
+    type pairs contribute exactly zero to every downstream ``segment_sum``.
+    The search stays purely geometric; this transform is applied ONCE at the
+    atomic-model seam (model-level ``pair_exclude_types``) and, for
+    descriptor-level ``exclude_types``, inside the descriptor's graph
+    forward. Identity (returns ``graph`` itself) when ``pair_excl`` is
+    ``None`` or empty.
+
+    Parameters
+    ----------
+    graph
+        The neighbor graph; only ``edge_mask`` (and, if ``compact=True``,
+        ``edge_index`` and ``edge_vec``) are replaced.  Angle fields are
+        never re-indexed by this transform.
+    atype
+        (N,) flat node types, clamped >= 0 (virtual atoms already handled
+        by the caller / the builders).
+    pair_excl
+        The ``PairExcludeMask`` holding the excluded (ti, tj) set.
+    compact
+        If ``False`` (default), only zero-out masked edges via ``edge_mask``
+        (shape-static; the ONLY mode allowed in compiled / AOTI paths).
+        If ``True``, additionally drop masked edges so the returned graph
+        has no padding on the edge axis (data-dependent shape; eager /
+        dynamic-nedge only).
+
+    Returns
+    -------
+    NeighborGraph
+        A ``dataclasses.replace`` copy (or the original ``graph`` on early
+        exit) with the exclusion applied.
+    """
+    import dataclasses
+
+    if pair_excl is None or len(pair_excl.get_exclude_types()) == 0:
+        return graph
+    xp = array_api_compat.array_namespace(graph.edge_mask)
+    keep = pair_excl.build_edge_exclude_mask(graph.edge_index, atype)
+    out = dataclasses.replace(
+        graph,
+        edge_mask=graph.edge_mask * xp.astype(keep, graph.edge_mask.dtype),
+    )
+    if compact:
+        (keep_idx,) = xp.nonzero(out.edge_mask)
+        out = dataclasses.replace(
+            out,
+            edge_index=out.edge_index[:, keep_idx],
+            edge_vec=xp.take(out.edge_vec, keep_idx, axis=0),
+            edge_mask=xp.take(out.edge_mask, keep_idx, axis=0),
+        )
+    return out
+
+
 def node_validity_mask(n_node: Array, n_total: int) -> Array:
     """Derive the (n_total,) real-vs-padding node mask from per-frame counts.
 
diff --git a/source/tests/common/dpmodel/test_apply_pair_exclusion.py b/source/tests/common/dpmodel/test_apply_pair_exclusion.py
new file mode 100644
index 0000000000..403f2b940e
--- /dev/null
+++ b/source/tests/common/dpmodel/test_apply_pair_exclusion.py
@@ -0,0 +1,168 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import numpy as np
+import pytest
+
+from deepmd.dpmodel.utils.exclude_mask import PairExcludeMask
+from deepmd.dpmodel.utils.neighbor_graph import (
+    NeighborGraph,
+    apply_pair_exclusion,
+)
+
+
+def _toy_graph():
+    # 4 nodes, types [0, 1, 0, 1]; 5 edges incl. one already-masked pad edge.
+    # edge_index rows: [src(neighbor), dst(center)]
+    edge_index = np.array([[1, 2, 3, 0, 0], [0, 0, 1, 3, 0]], dtype=np.int64)
+    edge_vec = np.ones((5, 3), dtype=np.float64)
+    edge_mask = np.array([1, 1, 1, 1, 0], dtype=np.int32)  # last = padding
+    n_node = np.array([4], dtype=np.int64)
+    return NeighborGraph(
+        n_node=n_node,
+        edge_index=edge_index,
+        edge_vec=edge_vec,
+        edge_mask=edge_mask,
+    )
+
+
+def test_none_and_empty_are_identity() -> None:
+    g = _toy_graph()
+    atype = np.array([0, 1, 0, 1], dtype=np.int64)
+    assert apply_pair_exclusion(g, atype, None) is g
+    assert apply_pair_exclusion(g, atype, PairExcludeMask(2, [])) is g
+
+
+def test_excluded_pairs_are_masked_and_padding_stays_masked() -> None:
+    g = _toy_graph()
+    atype = np.array([0, 1, 0, 1], dtype=np.int64)
+    out = apply_pair_exclusion(g, atype, PairExcludeMask(2, [(0, 1)]))
+    # edges (dst_t, src_t): e0 (0,1) excl, e1 (0,0) keep, e2 (1,1) keep,
+    # e3 (1,0) excl (symmetric), e4 padding stays 0.
+    np.testing.assert_array_equal(out.edge_mask, [0, 1, 1, 0, 0])
+    # non-mask fields untouched, input not mutated
+    np.testing.assert_array_equal(g.edge_mask, [1, 1, 1, 1, 0])
+    assert out.edge_index is g.edge_index
+    assert out.edge_vec is g.edge_vec
+
+
+def test_no_exclusion_empty_list_is_identity() -> None:
+    """Cover PairExcludeMask with non-None but empty exclude list."""
+    g = _toy_graph()
+    atype = np.array([0, 1, 0, 1], dtype=np.int64)
+    result = apply_pair_exclusion(g, atype, PairExcludeMask(2, []))
+    assert result is g
+
+
+def test_no_excluded_edges_in_graph() -> None:
+    """Exclusion list non-empty but no edge matches — all real edges stay."""
+    g = _toy_graph()
+    atype = np.array([0, 0, 0, 0], dtype=np.int64)  # all same type
+    out = apply_pair_exclusion(g, atype, PairExcludeMask(2, [(0, 1)]))
+    # (0,1) never appears — all edges kept (except pre-existing padding)
+    np.testing.assert_array_equal(out.edge_mask, [1, 1, 1, 1, 0])
+
+
+def test_torch_namespace_smoke() -> None:
+    torch = pytest.importorskip("torch")
+    g = _toy_graph()
+    gt = NeighborGraph(
+        n_node=torch.from_numpy(g.n_node),
+        edge_index=torch.from_numpy(g.edge_index),
+        edge_vec=torch.from_numpy(g.edge_vec),
+        edge_mask=torch.from_numpy(g.edge_mask),
+    )
+    atype = torch.tensor([0, 1, 0, 1], dtype=torch.int64)
+    out = apply_pair_exclusion(gt, atype, PairExcludeMask(2, [(0, 1)]))
+    np.testing.assert_array_equal(out.edge_mask.numpy(), [0, 1, 1, 0, 0])
+
+
+# ---------------------------------------------------------------------------
+# compact=True tests
+# ---------------------------------------------------------------------------
+
+
+def test_compact_drops_masked_edges() -> None:
+    """compact=True must keep exactly the valid edges (after exclusion)."""
+    g = _toy_graph()
+    atype = np.array([0, 1, 0, 1], dtype=np.int64)
+    out_mask = apply_pair_exclusion(g, atype, PairExcludeMask(2, [(0, 1)]))
+    out_compact = apply_pair_exclusion(
+        g, atype, PairExcludeMask(2, [(0, 1)]), compact=True
+    )
+    # Expected kept edges: indices 1 and 2 (mask-only has [0,1,1,0,0])
+    assert out_compact.edge_index.shape[1] == 2
+    assert out_compact.edge_vec.shape[0] == 2
+    assert out_compact.edge_mask.shape[0] == 2
+    # all remaining edge_mask entries must be 1
+    np.testing.assert_array_equal(out_compact.edge_mask, [1, 1])
+    # edge_index content matches kept edges from mask path
+    np.testing.assert_array_equal(
+        out_compact.edge_index, out_mask.edge_index[:, [1, 2]]
+    )
+
+
+def test_compact_drops_preexisting_padding_too() -> None:
+    """Pre-existing padding (edge 4) must be dropped even with no exclusions."""
+    g = _toy_graph()
+    atype = np.array([0, 0, 0, 0], dtype=np.int64)  # no type exclusions
+    # An empty exclusion list would hit the identity early-exit and return ``g``
+    # unchanged, padding included.  So pass a non-empty list that matches no
+    # edge here: the type keep-mask is all ones, ``edge_mask`` equals ``g``'s,
+    # and compaction is driven solely by the pre-existing padding entry.
+    out = apply_pair_exclusion(g, atype, PairExcludeMask(2, [(0, 1)]), compact=True)
+    # (0,1) never appears in this graph (all types are 0) → all real edges kept
+    # but pre-existing padding (edge 4) should be dropped
+    assert out.edge_index.shape[1] == 4  # only 4 real edges, padding gone
+    np.testing.assert_array_equal(out.edge_mask, [1, 1, 1, 1])
+
+
+def test_compact_torch_smoke() -> None:
+    torch = pytest.importorskip("torch")
+    g = _toy_graph()
+    gt = NeighborGraph(
+        n_node=torch.from_numpy(g.n_node),
+        edge_index=torch.from_numpy(g.edge_index),
+        edge_vec=torch.from_numpy(g.edge_vec),
+        edge_mask=torch.from_numpy(g.edge_mask),
+    )
+    atype = torch.tensor([0, 1, 0, 1], dtype=torch.int64)
+    out = apply_pair_exclusion(gt, atype, PairExcludeMask(2, [(0, 1)]), compact=True)
+    np.testing.assert_array_equal(out.edge_mask.numpy(), [1, 1])
+    assert out.edge_index.shape[1] == 2
+
+
+def test_compact_invariance_vs_mask_only() -> None:
+    """Descriptor-level invariance: segment_sum over mask-only == compact.
+
+    Masked edges contribute zero to the sum; dropping them should give identical
+    results.
+    """
+    g = _toy_graph()
+    atype = np.array([0, 1, 0, 1], dtype=np.int64)
+    excl = PairExcludeMask(2, [(0, 1)])
+
+    out_mask = apply_pair_exclusion(g, atype, excl, compact=False)
+    out_compact = apply_pair_exclusion(g, atype, excl, compact=True)
+
+    # Build fake per-edge values (like edge_env_mat output)
+    vals_mask = np.arange(5, dtype=np.float64).reshape(5, 1) + 1.0
+    vals_mask_valid = vals_mask * out_mask.edge_mask[:, None]
+
+    # Map compact edge indices to the original edge values
+    # Kept edges are 1 and 2 (mask [0,1,1,0,0])
+    vals_compact = vals_mask[out_mask.edge_mask.astype(bool)]
+
+    # segment_sum over dst (center) node axis: 4 nodes
+    N = 4
+    dst_mask = out_mask.edge_index[1]  # (5,)
+    dst_compact = out_compact.edge_index[1]  # (2,)
+
+    # manual segment_sum for mask path
+    result_mask = np.zeros((N, 1), dtype=np.float64)
+    for ei, v in zip(dst_mask, vals_mask_valid, strict=True):
+        result_mask[ei] += v
+
+    result_compact = np.zeros((N, 1), dtype=np.float64)
+    for ei, v in zip(dst_compact, vals_compact, strict=True):
+        result_compact[ei] += v
+
+    np.testing.assert_allclose(result_mask, result_compact)

From fffca360628d53cd3d1be0d95982d633f002e35e Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 00:40:14 +0800
Subject: [PATCH 17/38] fix(dpmodel): raise NotImplementedError in
 apply_pair_exclusion(compact=True) when angle fields are present

---
 deepmd/dpmodel/utils/neighbor_graph/graph.py  |  8 ++++
 .../dpmodel/test_apply_pair_exclusion.py      | 47 +++++++++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/deepmd/dpmodel/utils/neighbor_graph/graph.py b/deepmd/dpmodel/utils/neighbor_graph/graph.py
index 20a12be6d7..dd82b545de 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/graph.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/graph.py
@@ -222,6 +222,14 @@ def apply_pair_exclusion(
         edge_mask=graph.edge_mask * xp.astype(keep, graph.edge_mask.dtype),
     )
     if compact:
+        if graph.angle_index is not None or graph.angle_mask is not None:
+            raise NotImplementedError(
+                "apply_pair_exclusion(compact=True) is not supported when the "
+                "NeighborGraph carries angle fields (angle_index / angle_mask). "
+                "Angle indices reference pre-compaction edge positions and would "
+                "become silently wrong after edge compaction. Either use "
+                "compact=False (mask-only mode) or strip the angle fields first."
+            )
         (keep_idx,) = xp.nonzero(out.edge_mask)
         out = dataclasses.replace(
             out,
diff --git a/source/tests/common/dpmodel/test_apply_pair_exclusion.py b/source/tests/common/dpmodel/test_apply_pair_exclusion.py
index 403f2b940e..2dad917120 100644
--- a/source/tests/common/dpmodel/test_apply_pair_exclusion.py
+++ b/source/tests/common/dpmodel/test_apply_pair_exclusion.py
@@ -166,3 +166,50 @@ def test_compact_invariance_vs_mask_only() -> None:
         result_compact[ei] += v
 
     np.testing.assert_allclose(result_mask, result_compact)
+
+
+# ---------------------------------------------------------------------------
+# compact=True with angle fields — must raise NotImplementedError
+# ---------------------------------------------------------------------------
+
+
+def _toy_graph_with_angles():
+    """Same base graph as _toy_graph but with angle_index/angle_mask populated."""
+    g = _toy_graph()
+    import dataclasses
+
+    # Two toy angles (pairs of edges sharing a center)
+    angle_index = np.array([[0, 1], [1, 2]], dtype=np.int64)
+    angle_mask = np.array([1, 1], dtype=np.int32)
+    return dataclasses.replace(g, angle_index=angle_index, angle_mask=angle_mask)
+
+
+def test_compact_raises_when_angle_index_present() -> None:
+    """compact=True must raise NotImplementedError when angle_index is set."""
+    g = _toy_graph_with_angles()
+    atype = np.array([0, 1, 0, 1], dtype=np.int64)
+    with pytest.raises(NotImplementedError, match="angle"):
+        apply_pair_exclusion(g, atype, PairExcludeMask(2, [(0, 1)]), compact=True)
+
+
+def test_compact_raises_when_only_angle_mask_present() -> None:
+    """compact=True must raise even when only angle_mask (not angle_index) is set."""
+    import dataclasses
+
+    g = _toy_graph()
+    angle_mask = np.array([1], dtype=np.int32)
+    g_with_mask = dataclasses.replace(g, angle_mask=angle_mask)
+    atype = np.array([0, 1, 0, 1], dtype=np.int64)
+    with pytest.raises(NotImplementedError, match="angle"):
+        apply_pair_exclusion(
+            g_with_mask, atype, PairExcludeMask(2, [(0, 1)]), compact=True
+        )
+
+
+def test_compact_works_when_angle_fields_are_none() -> None:
+    """compact=True must NOT raise when angle_index and angle_mask are both None."""
+    g = _toy_graph()  # angle_index=None, angle_mask=None by default
+    atype = np.array([0, 1, 0, 1], dtype=np.int64)
+    # Should succeed; reuse the existing compact assertion
+    out = apply_pair_exclusion(g, atype, PairExcludeMask(2, [(0, 1)]), compact=True)
+    assert out.edge_index.shape[1] == 2

From ed6334e676b710c0491817ada09db289b12ae7e3 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 00:46:19 +0800
Subject: [PATCH 18/38] refactor(dpmodel): atomic-model pair exclusion via
 apply_pair_exclusion

---
 .../dpmodel/atomic_model/base_atomic_model.py | 14 +++-----
 .../dpmodel/test_graph_atomic_parity.py       | 36 +++++++++++++++++++
 2 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py
index bf41735f89..866bb22329 100644
--- a/deepmd/dpmodel/atomic_model/base_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
-import dataclasses
 import functools
 import math
 from collections.abc import (
@@ -10,6 +9,10 @@
     Any,
 )
 
+from deepmd.dpmodel.utils.neighbor_graph import (
+    apply_pair_exclusion,
+)
+
 if TYPE_CHECKING:
     from deepmd.dpmodel.utils.neighbor_graph import (
         NeighborGraph,
@@ -356,14 +359,7 @@ def forward_common_atomic_graph(
         atype = xp.asarray(atype, device=array_api_compat.device(graph.edge_vec))
         atom_mask = self.make_atom_mask(atype)  # (N,) bool
         atype_clamped = xp.where(atom_mask, atype, xp.zeros_like(atype))
-        if self.pair_excl is not None:
-            keep = self.pair_excl.build_edge_exclude_mask(
-                graph.edge_index, atype_clamped
-            )
-            graph = dataclasses.replace(
-                graph,
-                edge_mask=graph.edge_mask * xp.astype(keep, graph.edge_mask.dtype),
-            )
+        graph = apply_pair_exclusion(graph, atype_clamped, self.pair_excl)
         ret_dict = self.forward_atomic_graph(
             graph,
             atype_clamped,
diff --git a/source/tests/common/dpmodel/test_graph_atomic_parity.py b/source/tests/common/dpmodel/test_graph_atomic_parity.py
index 7de084a25f..224b25f852 100644
--- a/source/tests/common/dpmodel/test_graph_atomic_parity.py
+++ b/source/tests/common/dpmodel/test_graph_atomic_parity.py
@@ -15,8 +15,12 @@
     EnergyModel,
 )
 from deepmd.dpmodel.utils.neighbor_graph import (
+    apply_pair_exclusion,
     from_dense_quartet,
 )
+from deepmd.dpmodel.utils.exclude_mask import (
+    PairExcludeMask,
+)
 from deepmd.dpmodel.utils.nlist import (
     extend_input_and_build_neighbor_list,
 )
@@ -306,3 +310,35 @@ def test_graph_matches_dense_with_out_bias():
         )
     # non-vacuous: the bias actually shifted the graph energy
     assert not np.allclose(np.asarray(g["energy"]), np.asarray(g_zero["energy"]))
+
+
+# ── apply_pair_exclusion idempotence (Task 2) ─────────────────────────────────
+
+
+@pytest.mark.parametrize(
+    "pair_exclude_types", [[], [(0, 1)]]
+)  # empty branch AND non-empty branch
+def test_apply_pair_exclusion_idempotent(pair_exclude_types):
+    """Applying apply_pair_exclusion twice gives the same edge_mask as once.
+
+    Covers both the empty pair_excl branch (identity) and non-empty branch.
+    """
+    rng = np.random.default_rng(42)
+    coord = rng.normal(size=(1, 5, 3)) * 1.5
+    atype = np.array([[0, 1, 0, 1, 0]], dtype=np.int64)
+    ds = DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=[200], ntypes=2, attn_layer=0)
+    ft = InvarFitting("energy", 2, ds.get_dim_out(), 1, mixed_types=True)
+    am = DPAtomicModel(ds, ft, type_map=["a", "b"])
+    ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
+        coord, atype, 4.0, [200], mixed_types=True, box=None
+    )
+    ng = from_dense_quartet(ext_coord, nlist, mapping)
+    pair_excl = PairExcludeMask(2, pair_exclude_types) if pair_exclude_types else None
+    atype_flat = atype.reshape(-1)
+    once = apply_pair_exclusion(ng, atype_flat, pair_excl)
+    twice = apply_pair_exclusion(once, atype_flat, pair_excl)
+    # Masks must be exactly equal (AND-idempotent for 0/1 values)
+    np.testing.assert_array_equal(
+        np.asarray(once.edge_mask),
+        np.asarray(twice.edge_mask),
+    )

From 6c2b007c969590634f64acc000579129d943d5a7 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 00:54:58 +0800
Subject: [PATCH 19/38] feat(dpmodel): dpa1 graph path supports exclude_types
 via apply_pair_exclusion; drop eligibility gate

---
 deepmd/dpmodel/descriptor/dpa1.py             | 48 +++++++-----
 .../dpmodel/test_dpa1_call_graph_block.py     | 59 ++++++++++----
 .../test_dpa1_call_graph_descriptor.py        | 78 ++++++++++++++++---
 .../dpmodel/test_graph_atomic_parity.py       |  6 +-
 4 files changed, 142 insertions(+), 49 deletions(-)

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index 30a2d25e38..fbd71d3f7c 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -431,9 +431,10 @@ def uses_graph_lower(self) -> bool:
 
         The graph-native lower (``call_graph``) covers the factorizable path
         AND transformer attention (``attn_layer >= 0``, NeighborGraph PR-D)
-        with concat type-embedding and no type exclusion. Remaining ineligible
-        configs (``tebd_input_mode == "strip"``, ``exclude_types``) fall back
-        to the legacy dense path, so those models keep working unchanged.
+        with concat type-embedding.  ``exclude_types`` is fully supported via
+        :func:`~deepmd.dpmodel.utils.neighbor_graph.apply_pair_exclusion`.
+        The only remaining ineligible config is ``tebd_input_mode == "strip"``,
+        which falls back to the legacy dense path.
 
         Eligibility does NOT imply numerical interchangeability with the
         dense route for every config: with ``smooth_type_embedding=True``
@@ -441,10 +442,7 @@ def uses_graph_lower(self) -> bool:
         differs from the dense lower by up to ~1e-4 (see the Notes of
         :meth:`call_graph`).
         """
-        return (
-            self.se_atten.tebd_input_mode == "concat"
-            and not self.se_atten.exclude_types
-        )
+        return self.se_atten.tebd_input_mode == "concat"
 
     def share_params(
         self, base_class: "DescrptDPA1", shared_level: int, resume: bool = False
@@ -574,9 +572,9 @@ def call(
         nall = xp.reshape(coord_ext, (nlist.shape[0], -1)).shape[1] // 3
         # graph-eligible configs route through the graph-native adapter (decision
         # #14: graph = single math source, dense call = thin adapter). Ineligible
-        # configs (attention, strip tebd, exclude_types) and the ghost case with
-        # no mapping fall back to the legacy dense body. The graph needs `mapping`
-        # to fold ghosts to local owners; without it only nall == nloc is valid.
+        # configs (strip tebd) and the ghost case with no mapping fall back to
+        # the legacy dense body. The graph needs `mapping` to fold ghosts to
+        # local owners; without it only nall == nloc is valid.
         if self.uses_graph_lower() and (mapping is not None or nall == nloc):
             return self._call_graph_adapter(coord_ext, atype_ext, nlist, mapping)
         else:
@@ -658,9 +656,10 @@ def _call_graph_adapter(
         grrg = xp.reshape(grrg_flat, (nf, nloc, *grrg_flat.shape[1:]))
         rot_mat = xp.reshape(rot_mat_flat, (nf, nloc, *rot_mat_flat.shape[1:]))
         # reconstruct the dense-shaped sw the dense way (env_mat switch masked
-        # where nlist == -1; the graph path forbids exclude_types, so nlist_mask
-        # == nlist != -1, matching DescrptBlockSeAtten.call). A dense-layout
-        # artifact tied to neighbor slots, which the graph does not carry.
+        # where nlist == -1 OR the neighbor pair is type-excluded, matching
+        # DescrptBlockSeAtten.call which erases excluded nlist entries to -1
+        # before computing sw). A dense-layout artifact tied to neighbor slots,
+        # which the graph does not carry.
         _, _, sw = self.se_atten.env_mat.call(
             coord_ext,
             atype_ext,
@@ -670,6 +669,12 @@ def _call_graph_adapter(
         )
         nlist_mask = (nlist != -1)[:, :, :, None]
         sw = xp.where(nlist_mask, sw, xp.zeros_like(sw))
+        if self.se_atten.exclude_types:
+            # additionally mask excluded type-pairs (mirrors the block's nlist
+            # erasure: excluded entries become -1 there, so sw is 0 for them).
+            exc_mask = self.se_atten.emask.build_type_exclude_mask(nlist, atype_ext)
+            exc_mask = xp.astype(exc_mask[:, :, :, None], sw.dtype)
+            sw = sw * exc_mask
         sw = xp.reshape(sw, (nf, nloc, nnei, 1))
         return grrg, rot_mat, None, None, sw
 
@@ -1748,10 +1753,10 @@ def call_graph(
         Notes
         -----
         Known limitations:
-        - ``tebd_input_mode == "concat"`` only (strip mode lands later);
-        - ``exclude_types`` is not yet supported and raises (lands in a later PR).
+        - ``tebd_input_mode == "concat"`` only (strip mode lands later).
         """
         from deepmd.dpmodel.utils.neighbor_graph import (
+            apply_pair_exclusion,
             edge_env_mat,
             segment_sum,
         )
@@ -1760,11 +1765,6 @@ def call_graph(
             raise NotImplementedError(
                 "graph path supports tebd_input_mode='concat' only (NeighborGraph PR-A)"
             )
-        if self.exclude_types:
-            raise NotImplementedError(
-                "graph path does not yet apply exclude_types (NeighborGraph PR-A); "
-                "type exclusion lands in a later PR"
-            )
         if type_embedding is None:
             raise ValueError("type_embedding is required for the graph path")
         xp = array_api_compat.array_namespace(graph.edge_vec)
@@ -1772,9 +1772,15 @@ def call_graph(
         # N == sum(graph.n_node) by contract (atype is (N,)); use the static shape
         # value so the kernel stays jit/export-traceable (no concretize of n_node).
         n_total = atype.shape[0]
+        atype = xp.asarray(atype, device=dev)
+        # descriptor-level pair exclusion: same canonical transform as the
+        # model-level ``pair_exclude_types`` (decision #18). Masked edges
+        # contribute zero to every segment_sum below; the dense path's
+        # nlist-erasure + env-mat zeroing is reproduced exactly.
+        # apply_pair_exclusion is a no-op when self.emask has no exclusions.
+        graph = apply_pair_exclusion(graph, atype, self.emask)
         src = graph.edge_index[0, :]
         dst = graph.edge_index[1, :]
-        atype = xp.asarray(atype, device=dev)
         center_type = xp.take(atype, dst, axis=0)  # (E,)
         nei_type = xp.take(atype, src, axis=0)  # (E,)
         # per-edge env-mat 4-vector, normalized by the center (dst) atom type.
diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_block.py b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
index 9a984a30f3..25665fe6b2 100644
--- a/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
@@ -93,24 +93,57 @@ def test_block_graph_equals_dense_any_sel(self, sel, type_one_side) -> None:
     # attn_layer > 0 is supported since NeighborGraph PR-D; parity is covered
     # by test_dpa1_graph_attention_parity.py (the fail-fast test was removed).
 
-    def test_exclude_types_raises(self) -> None:
-        """The graph block kernel fail-fasts for exclude_types (not yet applied)."""
-        # the graph path does not yet apply type exclusion; it must fail-fast
-        # rather than silently diverge from the dense path (which masks edges).
+    def test_exclude_types_graph_parity(self) -> None:
+        """The graph block kernel supports exclude_types via apply_pair_exclusion.
+
+        Excluded edges are masked (edge_mask zeroed) before segment_sum, so
+        the graph block output matches the dense block output at rtol=atol=1e-12
+        for a non-binding sel.
+        """
+        from deepmd.dpmodel.utils.nlist import (
+            extend_input_and_build_neighbor_list,
+        )
+
+        rng = np.random.default_rng(99)
+        nloc = 4
+        coord = rng.normal(size=(1, nloc, 3)) * 1.5
+        atype_arr = np.array([[0, 1, 0, 1]], dtype=np.int64)
         dd = DescrptDPA1(
             rcut=4.0,
             rcut_smth=0.5,
-            sel=[20],
+            sel=[30],  # non-binding
             ntypes=2,
             attn_layer=0,
+            axis_neuron=2,
+            neuron=[6, 12],
             exclude_types=[(0, 1)],
         )
-        ng = from_dense_quartet(
-            self.coord,
-            -np.ones((1, self.nloc, 1), dtype=np.int64),  # any graph; guard fires first
-            np.arange(self.nloc, dtype=np.int64)[None],
+        ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
+            coord,
+            atype_arr,
+            dd.get_rcut(),
+            dd.get_sel(),
+            mixed_types=dd.mixed_types(),
+            box=None,
+        )
+        ng = from_dense_quartet(ext_coord, nlist, mapping, compact=False)
+        atype_local = atype_arr.reshape(-1)
+        tebd = dd.type_embedding.call()
+        # graph block call
+        grrg_g, rot_mat_g, _, _, sw_g = dd.se_atten.call(
+            nlist,
+            ext_coord,
+            ext_atype,
+            atype_embd_ext=np.reshape(
+                np.take(tebd, ext_atype.reshape(-1), axis=0),
+                (*ext_atype.shape, dd.tebd_dim),
+            ),
+            mapping=None,
+            type_embedding=tebd,
+        )
+        # also call the block's graph path directly and ensure no raise
+        grrg_blk, rot_mat_blk = dd.se_atten.call_graph(
+            ng, atype_local, type_embedding=tebd
         )
-        with pytest.raises(NotImplementedError):
-            dd.se_atten.call_graph(
-                ng, self.atype.reshape(-1), type_embedding=dd.type_embedding.call()
-            )
+        assert grrg_blk.shape[0] == atype_local.shape[0]  # flat N axis
+        assert not np.any(np.isnan(grrg_blk))
diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
index dc1d51da91..70d2d77595 100644
--- a/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
@@ -96,20 +96,17 @@ def test_descriptor_graph_equals_dense_full_tuple(self, sel) -> None:
         # sw
         np.testing.assert_allclose(out[4], ref[4], rtol=1e-12, atol=1e-12)
 
-    @pytest.mark.parametrize(
-        "kwargs",
-        [
-            {"tebd_input_mode": "strip"},  # strip tebd: graph unsupported -> dense
-            {"exclude_types": [(0, 1)]},  # type exclusion: graph unsupported -> dense
-        ],
-    )
-    def test_ineligible_config_falls_back_to_dense(self, kwargs) -> None:
-        """attn_layer=0 configs the graph can't handle (strip tebd, exclude_types)
-        must report uses_graph_lower()=False and run the dense body without
-        raising (regression: Task-3 routing previously raised NotImplementedError).
+    def test_strip_tebd_falls_back_to_dense(self) -> None:
+        """Strip tebd is still graph-ineligible: uses_graph_lower()=False and
+        dd.call() returns the dense result without raising.
         """
         dd = DescrptDPA1(
-            rcut=4.0, rcut_smth=0.5, sel=[30], ntypes=2, attn_layer=0, **kwargs
+            rcut=4.0,
+            rcut_smth=0.5,
+            sel=[30],
+            ntypes=2,
+            attn_layer=0,
+            tebd_input_mode="strip",
         )
         assert dd.uses_graph_lower() is False
         ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
@@ -123,6 +120,63 @@ def test_ineligible_config_falls_back_to_dense(self, kwargs) -> None:
         out = dd.call(ext_coord, ext_atype, nlist, mapping=mapping)  # must not raise
         assert len(out) == 5
 
+    @pytest.mark.parametrize(
+        "exclude_types",
+        [[], [(0, 1)]],  # empty exclusions AND non-trivial exclusion
+    )
+    def test_exclude_types_graph_eligible_and_parity(self, exclude_types) -> None:
+        """exclude_types (Task 3): descriptor is graph-eligible (uses_graph_lower()
+        True) regardless of the exclusion list.  Graph output must match the dense
+        reference at rtol=atol=1e-12 for a non-binding sel.
+        """
+        from deepmd.dpmodel.utils.neighbor_graph import (
+            from_dense_quartet,
+        )
+
+        dd = DescrptDPA1(
+            rcut=4.0,
+            rcut_smth=0.5,
+            sel=[30],  # non-binding sel
+            ntypes=2,
+            attn_layer=0,
+            axis_neuron=2,
+            neuron=[6, 12],
+            exclude_types=exclude_types,
+        )
+        # gate: with any exclude list the descriptor must now be graph-eligible
+        assert dd.uses_graph_lower() is True
+
+        ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
+            self.coord,
+            self.atype,
+            dd.get_rcut(),
+            dd.get_sel(),
+            mixed_types=dd.mixed_types(),
+            box=None,
+        )
+        # dense reference (calls block directly)
+        ref = self._dense_reference(dd, ext_coord, ext_atype, nlist)
+        # graph-routed public call
+        out = dd.call(ext_coord, ext_atype, nlist, mapping=mapping)
+        assert len(out) == 5
+        np.testing.assert_allclose(out[0], ref[0], rtol=1e-12, atol=1e-12)
+        np.testing.assert_allclose(out[1], ref[1], rtol=1e-12, atol=1e-12)
+        np.testing.assert_allclose(out[4], ref[4], rtol=1e-12, atol=1e-12)
+
+        if exclude_types:
+            # smoke-check the descriptor-level graph entry point directly:
+            # with the (0, 1) exclusion applied, call_graph must run on the
+            # non-compact graph and return finite values. It returns only
+            # (grrg, rot_mat) here, so there is no sw channel to inspect.
+            graph = from_dense_quartet(ext_coord, nlist, mapping, compact=False)
+            atype_local = self.atype.reshape(-1)
+            grrg_g, rot_mat_g = dd.call_graph(
+                graph, atype_local, type_embedding=dd.type_embedding.call()
+            )
+            # no nan/inf in output with exclusions applied
+            assert not np.any(np.isnan(grrg_g))
+            assert not np.any(np.isinf(grrg_g))
+
     def test_eligible_no_mapping_with_ghosts_falls_back(self) -> None:
         """An eligible (concat) attn_layer=0 descriptor called with mapping=None
         on a PERIODIC system (nall > nloc ghosts) must fall back to the dense
diff --git a/source/tests/common/dpmodel/test_graph_atomic_parity.py b/source/tests/common/dpmodel/test_graph_atomic_parity.py
index 224b25f852..2608c412c2 100644
--- a/source/tests/common/dpmodel/test_graph_atomic_parity.py
+++ b/source/tests/common/dpmodel/test_graph_atomic_parity.py
@@ -125,10 +125,10 @@ def test_graph_matches_dense_over_flags(virtual, type_one_side, nf):
         assert int(np.asarray(g["mask"])[0, -1]) == 0  # virtual atom masked
 
 
-def test_pair_exclude_types_falls_back_to_dense():
-    """Pair exclude_types is unsupported on the graph -> uses_graph_lower False."""
+def test_descriptor_exclude_types_is_graph_eligible():
+    """Descriptor-level exclude_types (Task 3): uses_graph_lower() is True."""
     m = _ener_model([30], exclude_types=[(0, 1)])
-    assert m.atomic_model.descriptor.uses_graph_lower() is False
+    assert m.atomic_model.descriptor.uses_graph_lower() is True
 
 
 def test_model_pair_exclude_types_graph_matches_dense():

From 81e3e633df630cb73b793548dd7959fd25ace739 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 00:57:06 +0800
Subject: [PATCH 20/38] test(dpmodel/dpa1): parametrize exclude_types graph
 parity over attn_layer and type_one_side

Add @pytest.mark.parametrize for attn_layer in [0, 2] and type_one_side in
[False, True] to test_exclude_types_graph_parity. Also adds the missing parity
assertion (graph vs dense at rtol=atol=1e-12, non-binding sel). Uses
smooth_type_embedding=False to avoid the known by-design softmax denominator
divergence in the dense smooth path.
---
 .../dpmodel/test_dpa1_call_graph_block.py     | 37 +++++++++++++------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_block.py b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
index 25665fe6b2..e88a5a752e 100644
--- a/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
@@ -93,12 +93,17 @@ def test_block_graph_equals_dense_any_sel(self, sel, type_one_side) -> None:
     # attn_layer > 0 is supported since NeighborGraph PR-D; parity is covered
     # by test_dpa1_graph_attention_parity.py (the fail-fast test was removed).
 
-    def test_exclude_types_graph_parity(self) -> None:
+    @pytest.mark.parametrize("type_one_side", [False, True])  # tebd concat branch
+    @pytest.mark.parametrize("attn_layer", [0, 2])  # no-attn and multi-layer attention
+    def test_exclude_types_graph_parity(self, attn_layer, type_one_side) -> None:
         """The graph block kernel supports exclude_types via apply_pair_exclusion.
 
         Excluded edges are masked (edge_mask zeroed) before segment_sum, so
         the graph block output matches the dense block output at rtol=atol=1e-12
-        for a non-binding sel.
+        for a non-binding sel.  Parametrized over attn_layer (0 and 2) and
+        type_one_side (False and True).  smooth_type_embedding=False is used
+        for attn_layer>0 to avoid the known by-design divergence where the dense
+        smooth path keeps sel-padding terms in the softmax denominator.
         """
         from deepmd.dpmodel.utils.nlist import (
             extend_input_and_build_neighbor_list,
@@ -113,10 +118,12 @@ def test_exclude_types_graph_parity(self) -> None:
             rcut_smth=0.5,
             sel=[30],  # non-binding
             ntypes=2,
-            attn_layer=0,
+            attn_layer=attn_layer,
             axis_neuron=2,
             neuron=[6, 12],
             exclude_types=[(0, 1)],
+            type_one_side=type_one_side,
+            smooth_type_embedding=False,  # avoid dense smooth divergence at attn>0
         )
         ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
             coord,
@@ -129,21 +136,29 @@ def test_exclude_types_graph_parity(self) -> None:
         ng = from_dense_quartet(ext_coord, nlist, mapping, compact=False)
         atype_local = atype_arr.reshape(-1)
         tebd = dd.type_embedding.call()
-        # graph block call
-        grrg_g, rot_mat_g, _, _, sw_g = dd.se_atten.call(
+        nf, nall = ext_atype.shape
+        atype_embd_ext = np.reshape(
+            np.take(tebd, ext_atype.reshape(-1), axis=0),
+            (nf, nall, dd.tebd_dim),
+        )
+        # dense block call (exclusion via nlist erasure inside the block)
+        grrg_dense, *_ = dd.se_atten.call(
             nlist,
             ext_coord,
             ext_atype,
-            atype_embd_ext=np.reshape(
-                np.take(tebd, ext_atype.reshape(-1), axis=0),
-                (*ext_atype.shape, dd.tebd_dim),
-            ),
+            atype_embd_ext=atype_embd_ext,
             mapping=None,
             type_embedding=tebd,
         )
-        # also call the block's graph path directly and ensure no raise
-        grrg_blk, rot_mat_blk = dd.se_atten.call_graph(
+        # graph block call (apply_pair_exclusion via edge_mask zeroing)
+        grrg_blk, _rot_mat = dd.se_atten.call_graph(
             ng, atype_local, type_embedding=tebd
         )
         assert grrg_blk.shape[0] == atype_local.shape[0]  # flat N axis
         assert not np.any(np.isnan(grrg_blk))
+        np.testing.assert_allclose(
+            grrg_blk.reshape(grrg_dense.shape),
+            grrg_dense,
+            rtol=1e-12,
+            atol=1e-12,
+        )

From 320848457800cf9c4404c54ee19179ca38cf90c7 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 01:03:53 +0800
Subject: [PATCH 21/38] docs: remove stale exclude_types graph-ineligibility
 claims

Descriptor-level exclude_types is now graph-eligible (fully supported via
apply_pair_exclusion). Remove 'no exclude_types' from four docstrings/error
messages that list graph eligibility conditions. The gate condition was removed
in the NeighborGraph implementation; only tebd_input_mode='concat' restriction
remains.

- deepmd/pt_expt/entrypoints/main.py: freeze docstring (~502) + ValueError message (~589)
- deepmd/dpmodel/model/make_model.py: forward docstring (~317)
- deepmd/pt_expt/train/training.py: _model_uses_graph_lower docstring (~591)
---
 deepmd/dpmodel/model/make_model.py | 5 ++---
 deepmd/pt_expt/entrypoints/main.py | 6 ++----
 deepmd/pt_expt/train/training.py   | 4 ++--
 3 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index be52bc9f22..cd70e318b6 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -313,9 +313,8 @@ def call_common(
 
                 The graph routes (``"dense"``/``"ase"``, and the pt_expt
                 default-flip) require a ``mixed_types`` descriptor with a graph
-                lower (dpa1/se_atten with concat type embedding and no
-                ``exclude_types``; attention layers included).  At non-binding
-                ``sel`` the graph matches the dense path exactly for the
+                lower (dpa1/se_atten with concat type embedding; attention layers included).
+                At non-binding ``sel`` the graph matches the dense path exactly for the
                 non-smooth branch; at binding ``sel`` the carry-all graph keeps
                 neighbors the dense path truncates, and for
                 ``smooth_type_embedding=True`` the graph drops the dense
diff --git a/deepmd/pt_expt/entrypoints/main.py b/deepmd/pt_expt/entrypoints/main.py
index eeb97dcd2c..f74c911ff5 100644
--- a/deepmd/pt_expt/entrypoints/main.py
+++ b/deepmd/pt_expt/entrypoints/main.py
@@ -499,8 +499,7 @@ def freeze(
         Lower-level export form: ``"nlist"`` (default, dense neighbor-list lower)
         or ``"graph"`` (NeighborGraph edge-list lower). ``"graph"`` is only valid
         for graph-eligible models (``mixed_types`` and ``uses_graph_lower``:
-        dpa1/se_atten with concat type embedding and no ``exclude_types``,
-        attention layers included) and selects the C++ graph inference path;
+        dpa1/se_atten with concat type embedding) and selects the C++ graph inference path;
         the per-atom virial is enabled for it (near-free in the graph path:
         one extra scatter off the shared single backward). NOTE: for
         ``smooth_type_embedding=True`` the carry-all graph attention
@@ -585,8 +584,7 @@ def freeze(
             raise ValueError(
                 "lower_kind='graph' requires a graph-eligible model "
                 "(mixed_types and a descriptor exposing uses_graph_lower()==True, "
-                "currently dpa1 with tebd_input_mode='concat' and no "
-                "exclude_types). Use lower_kind='nlist' for this model."
+                "currently dpa1 with tebd_input_mode='concat'). Use lower_kind='nlist' for this model."
             )
         do_atomic_virial = True
 
diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index d2868a4082..818ff9c46a 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -587,8 +587,8 @@ def _model_uses_graph_lower(model: torch.nn.Module) -> bool:
     :meth:`~deepmd.pt_expt.model.make_model.make_model.<locals>.CM._resolve_graph_method`
     for ``neighbor_graph_method is None`` (the training default): a model is
     graph-eligible iff it is ``mixed_types`` AND its single descriptor reports
-    ``uses_graph_lower() == True`` (dpa1/se_atten with concat type embedding
-    and no ``exclude_types``; attention layers included).
+    ``uses_graph_lower() == True`` (dpa1/se_atten with concat type embedding;
+    attention layers included).
 
     When True the compiled lower must be the GRAPH ``forward_common_lower_graph``
     so the compiled path matches eager training (which already default-flips to

From 19e45e031f3b89a7a37ae477c9df160b145d5954 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 01:15:05 +0800
Subject: [PATCH 22/38] feat(neighbor_graph): dispatcher-level pair_excl
 post-process (task 3b)

build_neighbor_graph, build_neighbor_graph_ase, build_neighbor_graph_vesin,
build_neighbor_graph_nv all gain optional keyword-only pair_excl=None and
compact=False; default path = geometric search then apply_pair_exclusion.

_call_common_graph in pt_expt make_model wires atomic_model.pair_excl to
every builder call so model-level pair_exclude_types is applied at build time
(the atomic-model seam backstop stays as idempotent identity).

Oracle tests assert set-equality of the valid-edge set between
builder(pair_excl=X) and builder() + separate apply_pair_exclusion(X),
for dense (2 id + 3 oracle cases) and ase (2 cases); vesin gets 4 new tests
(2 identity, 2 oracle, parametrized over periodic).
---
 .../utils/neighbor_graph/ase_builder.py       |  24 ++-
 .../dpmodel/utils/neighbor_graph/builder.py   |  27 ++-
 deepmd/pt_expt/model/make_model.py            |  13 +-
 deepmd/pt_expt/utils/nv_graph_builder.py      |  24 ++-
 deepmd/pt_expt/utils/vesin_graph_builder.py   |  42 ++++-
 .../dpmodel/test_neighbor_graph_builder.py    | 157 ++++++++++++++++++
 .../pt_expt/utils/test_vesin_graph_builder.py |  53 ++++++
 7 files changed, 332 insertions(+), 8 deletions(-)

diff --git a/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py b/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py
index 3b00ee6fac..fa163634a4 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py
@@ -24,6 +24,9 @@
 from .from_ijs import (
     neighbor_graph_from_ijs,
 )
+from .graph import (
+    apply_pair_exclusion,
+)
 
 if TYPE_CHECKING:
     from deepmd.dpmodel.array_api import (
@@ -33,6 +36,7 @@
     from .graph import (
         GraphLayout,
         NeighborGraph,
+        PairExcludeMask,
     )
 
 
@@ -42,6 +46,9 @@ def build_neighbor_graph_ase(
     box: Array | None,
     rcut: float,
     layout: GraphLayout | None = None,
+    *,
+    pair_excl: PairExcludeMask | None = None,
+    compact: bool = False,
 ) -> NeighborGraph:
     """Build a CARRY-ALL NeighborGraph using ASE's O(N) cell-list search.
 
@@ -66,6 +73,14 @@ def build_neighbor_graph_ase(
         cutoff radius.
     layout
         edge-axis length policy; ``None`` => dynamic (torch) with ``min_edges`` guards.
+    pair_excl
+        Optional :class:`~deepmd.dpmodel.utils.neighbor_graph.graph.PairExcludeMask`
+        for model-level ``pair_exclude_types``. When given,
+        :func:`apply_pair_exclusion` is applied after the geometric search. ``None``
+        (default) leaves all geometrically valid edges present.
+    compact
+        Passed to :func:`apply_pair_exclusion`; see that function for details.
+        Ignored when ``pair_excl`` is ``None``.
 
     Returns
     -------
@@ -136,6 +151,13 @@ def _to_cpu_numpy(x: Any) -> np.ndarray:
     i_all, j_all = i_all[keep], j_all[keep]
     S_all, nframe_all = S_all[keep], nframe_all[keep]
 
-    return neighbor_graph_from_ijs(
+    graph = neighbor_graph_from_ijs(
         i_all, j_all, S_all, coord, box, nframe_all, nloc, layout=layout
     )
+    if pair_excl is not None:
+        import array_api_compat
+
+        xp = array_api_compat.array_namespace(coord)
+        atype_flat = xp.reshape(xp.asarray(atype), (-1,))
+        graph = apply_pair_exclusion(graph, atype_flat, pair_excl, compact=compact)
+    return graph
diff --git a/deepmd/dpmodel/utils/neighbor_graph/builder.py b/deepmd/dpmodel/utils/neighbor_graph/builder.py
index 71ca699e1b..0b39c1ddcb 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/builder.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/builder.py
@@ -42,6 +42,7 @@
 from .graph import (
     GraphLayout,
     NeighborGraph,
+    apply_pair_exclusion,
     pad_and_guard_edges,
 )
 
@@ -50,6 +51,10 @@
         Array,
     )
 
+    from .graph import (
+        PairExcludeMask,
+    )
+
 
 def from_dense_quartet(
     extended_coord: Array,
@@ -203,6 +208,9 @@ def build_neighbor_graph(
     box: Array | None,
     rcut: float,
     layout: GraphLayout | None = None,
+    *,
+    pair_excl: PairExcludeMask | None = None,
+    compact: bool = False,
 ) -> NeighborGraph:
     """Build a CARRY-ALL NeighborGraph DIRECTLY from coordinates (``dense`` search).
 
@@ -221,6 +229,11 @@ def build_neighbor_graph(
     ``method`` key. Edges map every neighbor to its LOCAL owner
     (``src = mapping[neighbor]``), so the graph is ghost-free.
 
+    When ``pair_excl`` is given, :func:`apply_pair_exclusion` is called as a
+    post-process after the geometric search (the default path). A builder MAY
+    natively fuse the exclusion into its search in a future PR; the contract is
+    set-equality of valid-edge sets with the default post-process path.
+
     Parameters
     ----------
     coord
@@ -236,6 +249,14 @@ def build_neighbor_graph(
         at non-binding ``sel``).
     layout
         edge-axis length policy; ``None`` => dynamic (torch) with ``min_edges`` guards.
+    pair_excl
+        Optional :class:`~deepmd.dpmodel.utils.exclude_mask.PairExcludeMask`
+        for model-level ``pair_exclude_types``. When given,
+        :func:`apply_pair_exclusion` is applied after the geometric search. ``None``
+        (default) leaves all geometrically valid edges present.
+    compact
+        Passed to :func:`apply_pair_exclusion`; see that function for details.
+        Ignored when ``pair_excl`` is ``None``.
     """
     from deepmd.dpmodel.utils.nlist import (
         extend_coord_with_ghosts,
@@ -298,9 +319,13 @@ def build_neighbor_graph(
         edge_index, edge_vec, layout.edge_capacity, layout.min_edges
     )
     n_node = xp.full((nf,), nloc, dtype=xp.int64, device=dev)
-    return NeighborGraph(
+    graph = NeighborGraph(
         n_node=n_node,
         edge_index=edge_index,
         edge_vec=edge_vec,
         edge_mask=edge_mask,
     )
+    if pair_excl is not None:
+        atype_flat = xp.reshape(atype, (-1,))
+        graph = apply_pair_exclusion(graph, atype_flat, pair_excl, compact=compact)
+    return graph
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index ae2e83eada..ee89b7bd78 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -468,22 +468,27 @@ def _call_common_graph(
                     "graph lower (e.g. dpa1 attn_layer=0)"
                 )
             rcut = self.get_rcut()
+            # Model-level pair_exclude_types — apply at build time so the seam
+            # backstop in forward_atomic_graph acts as an idempotent identity.
+            pair_excl = getattr(self.atomic_model, "pair_excl", None)
             if method == "dense":
-                ng = build_neighbor_graph(cc, atype, bb, rcut)
+                ng = build_neighbor_graph(cc, atype, bb, rcut, pair_excl=pair_excl)
             elif method == "ase":
-                ng = build_neighbor_graph_ase(cc, atype, bb, rcut)
+                ng = build_neighbor_graph_ase(cc, atype, bb, rcut, pair_excl=pair_excl)
             elif method == "vesin":
                 from deepmd.pt_expt.utils.vesin_graph_builder import (
                     build_neighbor_graph_vesin,
                 )
 
-                ng = build_neighbor_graph_vesin(cc, atype, bb, rcut)
+                ng = build_neighbor_graph_vesin(
+                    cc, atype, bb, rcut, pair_excl=pair_excl
+                )
             elif method == "nv":
                 from deepmd.pt_expt.utils.nv_graph_builder import (
                     build_neighbor_graph_nv,
                 )
 
-                ng = build_neighbor_graph_nv(cc, atype, bb, rcut)
+                ng = build_neighbor_graph_nv(cc, atype, bb, rcut, pair_excl=pair_excl)
             else:
                 raise ValueError(
                     f"unknown neighbor_graph_method {method!r}; "
diff --git a/deepmd/pt_expt/utils/nv_graph_builder.py b/deepmd/pt_expt/utils/nv_graph_builder.py
index 06a30c7dd4..65db788d13 100644
--- a/deepmd/pt_expt/utils/nv_graph_builder.py
+++ b/deepmd/pt_expt/utils/nv_graph_builder.py
@@ -22,14 +22,21 @@
 )
 
 from typing import (
+    TYPE_CHECKING,
     Any,
 )
 
+if TYPE_CHECKING:
+    from deepmd.dpmodel.utils.exclude_mask import (
+        PairExcludeMask,
+    )
+
 import torch
 
 from deepmd.dpmodel.utils.neighbor_graph import (
     GraphLayout,
     NeighborGraph,
+    apply_pair_exclusion,
     neighbor_graph_from_ijs,
 )
 from deepmd.pt.utils.nv_nlist import (
@@ -204,6 +211,9 @@ def build_neighbor_graph_nv(
     box: Any | None,
     rcut: float,
     layout: GraphLayout | None = None,
+    *,
+    pair_excl: PairExcludeMask | None = None,
+    compact: bool = False,
 ) -> NeighborGraph:
     """Build a CARRY-ALL NeighborGraph using nvalchemiops' GPU cell list.
 
@@ -219,6 +229,14 @@ def build_neighbor_graph_nv(
         cutoff radius.
     layout
         edge-axis length policy; ``None`` => dynamic with ``min_edges`` guards.
+    pair_excl
+        Optional :class:`~deepmd.dpmodel.utils.exclude_mask.PairExcludeMask`
+        for model-level ``pair_exclude_types``. When given,
+        :func:`apply_pair_exclusion` is applied after the geometric search. ``None``
+        (default) leaves all geometrically valid edges present.
+    compact
+        Passed to :func:`apply_pair_exclusion`; see that function for details.
+        Ignored when ``pair_excl`` is ``None``.
 
     Returns
     -------
@@ -275,6 +293,10 @@ def build_neighbor_graph_nv(
     center_local, src_local = center_local[keep], src_local[keep]
     shift, frame_idx = shift[keep], frame_idx[keep]
 
-    return neighbor_graph_from_ijs(
+    graph = neighbor_graph_from_ijs(
         center_local, src_local, shift, coord, box_out, frame_idx, nloc, layout=layout
     )
+    if pair_excl is not None:
+        at_flat = torch.as_tensor(atype, device=device).reshape(-1)
+        graph = apply_pair_exclusion(graph, at_flat, pair_excl, compact=compact)
+    return graph
diff --git a/deepmd/pt_expt/utils/vesin_graph_builder.py b/deepmd/pt_expt/utils/vesin_graph_builder.py
index 3f86fc7b75..414b356f68 100644
--- a/deepmd/pt_expt/utils/vesin_graph_builder.py
+++ b/deepmd/pt_expt/utils/vesin_graph_builder.py
@@ -19,15 +19,22 @@
 )
 
 from typing import (
+    TYPE_CHECKING,
     Any,
 )
 
+if TYPE_CHECKING:
+    from deepmd.dpmodel.utils.exclude_mask import (
+        PairExcludeMask,
+    )
+
 import array_api_compat
 import torch
 
 from deepmd.dpmodel.utils.neighbor_graph import (
     GraphLayout,
     NeighborGraph,
+    apply_pair_exclusion,
     neighbor_graph_from_ijs,
 )
 from deepmd.pt_expt.utils.vesin_neighbor_list import (
@@ -89,11 +96,40 @@ def build_neighbor_graph_vesin(
     box: Any | None,
     rcut: float,
     layout: GraphLayout | None = None,
+    *,
+    pair_excl: PairExcludeMask | None = None,
+    compact: bool = False,
 ) -> NeighborGraph:
     """Build a CARRY-ALL NeighborGraph using vesin.torch's O(N) cell list.
 
     Mirrors :func:`deepmd.dpmodel.utils.neighbor_graph.build_neighbor_graph_ase`
     but runs on the input tensor's device via ``vesin.torch``.
+
+    Parameters
+    ----------
+    coord
+        (nf, nloc, 3) or (nf, nloc*3) local coordinates (torch tensor).
+    atype
+        (nf, nloc) local atom types; ``type < 0`` marks a virtual atom.
+    box
+        (nf, 3, 3) simulation cell, or ``None`` for non-periodic.
+    rcut
+        cutoff radius.
+    layout
+        edge-axis length policy; ``None`` => dynamic with ``min_edges`` guards.
+    pair_excl
+        Optional :class:`~deepmd.dpmodel.utils.exclude_mask.PairExcludeMask`
+        for model-level ``pair_exclude_types``. When given,
+        :func:`apply_pair_exclusion` is applied after the geometric search. ``None``
+        (default) leaves all geometrically valid edges present.
+    compact
+        Passed to :func:`apply_pair_exclusion`; see that function for details.
+        Ignored when ``pair_excl`` is ``None``.
+
+    Returns
+    -------
+    graph
+        The carry-all :class:`NeighborGraph` over the LOCAL atoms.
     """
     if not is_vesin_torch_available():
         raise ImportError(
@@ -162,6 +198,10 @@ def build_neighbor_graph_vesin(
     # (grad-carrying). Unlike the nv builder, vesin's cell list handles
     # out-of-cell (unwrapped) positions natively, so no normalize_coord is
     # needed and S is consistent with the original coords as searched.
-    return neighbor_graph_from_ijs(
+    graph = neighbor_graph_from_ijs(
         i_all, j_all, S_all, coord, box, nf_all, nloc, layout=layout
     )
+    if pair_excl is not None:
+        at_flat = torch.as_tensor(atype, device=dev).reshape(-1)
+        graph = apply_pair_exclusion(graph, at_flat, pair_excl, compact=compact)
+    return graph
diff --git a/source/tests/common/dpmodel/test_neighbor_graph_builder.py b/source/tests/common/dpmodel/test_neighbor_graph_builder.py
index 9ba25c0ccb..8325408ff9 100644
--- a/source/tests/common/dpmodel/test_neighbor_graph_builder.py
+++ b/source/tests/common/dpmodel/test_neighbor_graph_builder.py
@@ -15,8 +15,12 @@
 
 import numpy as np
 
+from deepmd.dpmodel.utils.exclude_mask import (
+    PairExcludeMask,
+)
 from deepmd.dpmodel.utils.neighbor_graph import (
     GraphLayout,
+    apply_pair_exclusion,
     build_neighbor_graph,
     from_dense_quartet,
 )
@@ -313,5 +317,158 @@ def test_adapter_maps_ghost_to_local_owner(self) -> None:
         np.testing.assert_allclose(ev[0], np.array([3.0, 0.0, 0.0]))
 
 
+def valid_edge_set(ng):
+    """Return the set of (src, dst, rounded edge_vec) for all real edges."""
+    ei = ng.edge_index[:, ng.edge_mask]
+    ev = ng.edge_vec[ng.edge_mask]
+    return {
+        (int(ei[0, k]), int(ei[1, k]), tuple(np.round(ev[k], 6)))
+        for k in range(ei.shape[1])
+    }
+
+
+class TestBuildNeighborGraphPairExclOracle(unittest.TestCase):
+    """Oracle harness: builder(pair_excl=X) == builder() + apply_pair_exclusion(X).
+
+    Covers both ``pair_excl=None`` (identity; no exclusion applied) and a
+    non-empty exclusion set, for the ``dense`` backend.  The oracle asserts
+    SET-EQUALITY of the valid-edge set, matching the Task 3b contract.
+    """
+
+    def setUp(self) -> None:
+        self.rcut = 4.0
+        # 4 atoms, 2 types (0 and 1); atom 2 offset avoids degenerate rcut alignment.
+        self.coord = np.array(
+            [[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 2.3, 0.0], [3.5, 0.0, 0.0]],
+            dtype=np.float64,
+        ).reshape(1, 4, 3)
+        # type sequence: 0,1,0,1 -- pairs (0,1) and (1,0) are heterogeneous
+        self.atype = np.array([[0, 1, 0, 1]], dtype=np.int64)
+        self.ntypes = 2
+
+    def _pair_excl(self, exclude_pairs):
+        """Build a PairExcludeMask from a list of (ti, tj) tuples."""
+        return PairExcludeMask(self.ntypes, exclude_pairs)
+
+    def test_pair_excl_none_identity_dense(self) -> None:
+        """pair_excl=None: builder output unchanged (identity)."""
+        ng_ref = build_neighbor_graph(self.coord, self.atype, None, self.rcut)
+        ng_excl = build_neighbor_graph(
+            self.coord, self.atype, None, self.rcut, pair_excl=None
+        )
+        self.assertEqual(valid_edge_set(ng_ref), valid_edge_set(ng_excl))
+
+    def test_pair_excl_empty_list_identity_dense(self) -> None:
+        """pair_excl with empty exclude set: builder output unchanged."""
+        pe = self._pair_excl([])
+        ng_ref = build_neighbor_graph(self.coord, self.atype, None, self.rcut)
+        ng_excl = build_neighbor_graph(
+            self.coord, self.atype, None, self.rcut, pair_excl=pe
+        )
+        self.assertEqual(valid_edge_set(ng_ref), valid_edge_set(ng_excl))
+
+    def test_oracle_set_equality_dense_nonperiodic(self) -> None:
+        """Builder with pair_excl={(0,1),(1,0)} == builder() + apply_pair_exclusion."""
+        pe = self._pair_excl([(0, 1), (1, 0)])
+        # reference: build without exclusion then apply separately
+        ng_base = build_neighbor_graph(self.coord, self.atype, None, self.rcut)
+        atype_flat = self.atype.reshape(-1)
+        ng_post = apply_pair_exclusion(ng_base, atype_flat, pe)
+        # under test: builder applies exclusion internally
+        ng_fused = build_neighbor_graph(
+            self.coord, self.atype, None, self.rcut, pair_excl=pe
+        )
+        self.assertEqual(valid_edge_set(ng_post), valid_edge_set(ng_fused))
+        # sanity: exclusion actually REMOVED some edges
+        self.assertLess(int(ng_fused.edge_mask.sum()), int(ng_base.edge_mask.sum()))
+
+    def test_oracle_set_equality_dense_periodic(self) -> None:
+        """PBC: builder with pair_excl=[(0, 0)] == builder() + apply_pair_exclusion."""
+        pe = self._pair_excl([(0, 0)])
+        box = np.eye(3, dtype=np.float64)[None] * 6.0
+        ng_base = build_neighbor_graph(self.coord, self.atype, box, self.rcut)
+        atype_flat = self.atype.reshape(-1)
+        ng_post = apply_pair_exclusion(ng_base, atype_flat, pe)
+        ng_fused = build_neighbor_graph(
+            self.coord, self.atype, box, self.rcut, pair_excl=pe
+        )
+        self.assertEqual(valid_edge_set(ng_post), valid_edge_set(ng_fused))
+        # type-0 centers: atoms 0,2; type-0 neighbors excluded; fewer edges expected
+        self.assertLess(int(ng_fused.edge_mask.sum()), int(ng_base.edge_mask.sum()))
+
+    def test_oracle_set_equality_dense_multiframe(self) -> None:
+        """Multi-frame: set-equality holds over the valid edges of all frames."""
+        pe = self._pair_excl([(0, 1), (1, 0)])
+        coord2 = np.concatenate([self.coord, self.coord + 0.5], axis=0)
+        atype2 = np.concatenate([self.atype, self.atype], axis=0)
+        ng_base = build_neighbor_graph(coord2, atype2, None, self.rcut)
+        atype_flat = atype2.reshape(-1)
+        ng_post = apply_pair_exclusion(ng_base, atype_flat, pe)
+        ng_fused = build_neighbor_graph(coord2, atype2, None, self.rcut, pair_excl=pe)
+        self.assertEqual(valid_edge_set(ng_post), valid_edge_set(ng_fused))
+
+
+class TestBuildNeighborGraphAseOracle(unittest.TestCase):
+    """Oracle harness for the ASE builder pair_excl parameter.
+
+    Skipped when ``ase`` is not installed.  Asserts set-equality of the
+    valid-edge set between the ASE builder called with ``pair_excl`` and
+    the dense reference builder + separate :func:`apply_pair_exclusion`.
+    """
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        try:
+            import ase  # noqa: F401
+        except ImportError as e:
+            import unittest
+
+            raise unittest.SkipTest("ase not installed") from e
+
+    def setUp(self) -> None:
+        self.rcut = 4.0
+        self.coord = np.array(
+            [[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 2.3, 0.0], [3.5, 0.0, 0.0]],
+            dtype=np.float64,
+        ).reshape(1, 4, 3)
+        self.atype = np.array([[0, 1, 0, 1]], dtype=np.int64)
+        self.ntypes = 2
+
+    def _pair_excl(self, exclude_pairs):
+        return PairExcludeMask(self.ntypes, exclude_pairs)
+
+    def test_ase_pair_excl_none_identity(self) -> None:
+        """pair_excl=None: ASE builder output unchanged."""
+        from deepmd.dpmodel.utils.neighbor_graph import (
+            build_neighbor_graph_ase,
+        )
+
+        ng_ref = build_neighbor_graph_ase(self.coord, self.atype, None, self.rcut)
+        ng_excl = build_neighbor_graph_ase(
+            self.coord, self.atype, None, self.rcut, pair_excl=None
+        )
+        self.assertEqual(valid_edge_set(ng_ref), valid_edge_set(ng_excl))
+
+    def test_ase_oracle_set_equality(self) -> None:
+        """ASE builder with pair_excl == dense ref + apply_pair_exclusion."""
+        from deepmd.dpmodel.utils.neighbor_graph import (
+            build_neighbor_graph_ase,
+        )
+
+        pe = self._pair_excl([(0, 1), (1, 0)])
+        # dense reference + separate post-process
+        ng_dense = build_neighbor_graph(self.coord, self.atype, None, self.rcut)
+        atype_flat = self.atype.reshape(-1)
+        ng_ref = apply_pair_exclusion(ng_dense, atype_flat, pe)
+        # ASE builder with fused post-process
+        ng_ase = build_neighbor_graph_ase(
+            self.coord, self.atype, None, self.rcut, pair_excl=pe
+        )
+        self.assertEqual(valid_edge_set(ng_ref), valid_edge_set(ng_ase))
+        # exclusion actually removed edges
+        ng_ase_plain = build_neighbor_graph_ase(self.coord, self.atype, None, self.rcut)
+        self.assertLess(int(ng_ase.edge_mask.sum()), int(ng_ase_plain.edge_mask.sum()))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/source/tests/pt_expt/utils/test_vesin_graph_builder.py b/source/tests/pt_expt/utils/test_vesin_graph_builder.py
index dea25adab3..ce9ac567f9 100644
--- a/source/tests/pt_expt/utils/test_vesin_graph_builder.py
+++ b/source/tests/pt_expt/utils/test_vesin_graph_builder.py
@@ -3,7 +3,11 @@
 import pytest
 import torch
 
+from deepmd.dpmodel.utils.exclude_mask import (
+    PairExcludeMask,
+)
 from deepmd.dpmodel.utils.neighbor_graph import (
+    apply_pair_exclusion,
     build_neighbor_graph,
 )
 
@@ -105,3 +109,52 @@ def test_vesin_excludes_virtual_atoms_like_dense():
     ei = np.asarray(ng.edge_index)[:, np.asarray(ng.edge_mask)]
     at = atype.reshape(-1).numpy()
     assert np.all(at[ei[0]] >= 0) and np.all(at[ei[1]] >= 0)
+
+
+def _valid_edge_set(ng):
+    """Return the set of (src, dst, rounded edge_vec) for all real edges."""
+    ei = np.asarray(ng.edge_index)
+    ev = np.asarray(ng.edge_vec)
+    em = np.asarray(ng.edge_mask)
+    return {
+        (int(ei[0, k]), int(ei[1, k]), tuple(np.round(ev[k], 6)))
+        for k in range(ei.shape[1])
+        if em[k]
+    }
+
+
+@pytest.mark.parametrize("periodic", [False, True])  # non-PBC and PBC
+def test_vesin_pair_excl_none_identity(periodic):
+    """pair_excl=None: vesin builder output is unchanged (identity)."""
+    coord, atype, box = _system(periodic)
+    coord = coord.reshape(1, 4, 3)
+    box_3d = None if box is None else box.reshape(1, 3, 3)
+    ng_ref = vesin_builder.build_neighbor_graph_vesin(coord, atype, box_3d, 2.0)
+    ng_excl = vesin_builder.build_neighbor_graph_vesin(
+        coord, atype, box_3d, 2.0, pair_excl=None
+    )
+    assert _valid_edge_set(ng_ref) == _valid_edge_set(ng_excl)
+
+
+@pytest.mark.parametrize("periodic", [False, True])  # non-PBC and PBC
+def test_vesin_pair_excl_oracle_set_equality(periodic):
+    """Vesin builder(pair_excl=X) == dense ref + apply_pair_exclusion(X)."""
+    coord, atype, box = _system(periodic)
+    coord = coord.reshape(1, 4, 3)
+    box_3d = None if box is None else box.reshape(1, 3, 3)
+    rcut = 2.0
+    pe = PairExcludeMask(2, [(0, 1), (1, 0)])
+    # dense reference + separate post-process
+    ng_dense = build_neighbor_graph(coord, atype, box_3d, rcut)
+    atype_flat = atype.reshape(-1)
+    ng_ref = apply_pair_exclusion(ng_dense, atype_flat, pe)
+    # vesin builder with fused post-process
+    ng_vesin = vesin_builder.build_neighbor_graph_vesin(
+        coord, atype, box_3d, rcut, pair_excl=pe
+    )
+    assert _valid_edge_set(ng_ref) == _valid_edge_set(ng_vesin)
+    # exclusion actually removed edges
+    ng_plain = vesin_builder.build_neighbor_graph_vesin(coord, atype, box_3d, rcut)
+    assert int(np.asarray(ng_vesin.edge_mask).sum()) < int(
+        np.asarray(ng_plain.edge_mask).sum()
+    )

From 2711fb3b4fd7a02ff56795d361cd71284aa42d6e Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 01:19:00 +0800
Subject: [PATCH 23/38] test(pt_expt): pair_exclude_types graph-vs-legacy
 parity + vacuity check

---
 .../pt_expt/model/test_dpa1_graph_lower.py    | 63 +++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/source/tests/pt_expt/model/test_dpa1_graph_lower.py b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
index 135c0e04f3..fd09c23106 100644
--- a/source/tests/pt_expt/model/test_dpa1_graph_lower.py
+++ b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
@@ -325,6 +325,69 @@ def test_smooth_attention_divergence_pinned(self) -> None:
         assert e_diff < 1e-3, f"smooth divergence too large: {e_diff:.3e}"
         assert f_diff < 1e-3, f"smooth force divergence too large: {f_diff:.3e}"
 
+    def test_pair_exclude_types_graph_vs_legacy(self) -> None:
+        """Model-level pair_exclude_types: graph route and legacy dense agree
+        tightly (fp64: 1e-12 on CPU, 1e-10 otherwise), AND the excluded model
+        output differs from the no-exclude baseline (exclusion is not vacuous).
+
+        Strategy: build the no-exclude model, serialize it, inject
+        ``pair_exclude_types=[[0,1]]`` into the serialized dict, deserialize
+        to get an exclude model with IDENTICAL weights, then run both routes.
+        """
+        import copy
+
+        # 1. build the reference (no-exclude) model
+        model_ref = self._make_model(attn_layer=0)
+        model_ref.eval()
+
+        # 2. derive the exclude model by patching the serialized dict
+        data = copy.deepcopy(model_ref.serialize())
+        data["pair_exclude_types"] = [[0, 1]]
+        model_excl = EnergyModel.deserialize(data).to(self.device)
+        model_excl.eval()
+
+        tol = (
+            {"rtol": 1e-12, "atol": 1e-12}
+            if self.device.type == "cpu"
+            else {"rtol": 1e-10, "atol": 1e-10}
+        )
+        box = self.cell.reshape(1, 9)
+
+        # 3. graph route (build-time pair exclusion)
+        graph_out = model_excl.call_common(
+            self.coord.clone().requires_grad_(True),
+            self.atype,
+            box,
+            neighbor_graph_method="dense",
+        )
+        # 4. legacy dense route (seam backstop in forward_atomic_graph)
+        legacy_out = model_excl.call_common(
+            self.coord.clone().requires_grad_(True),
+            self.atype,
+            box,
+            neighbor_graph_method="legacy",
+        )
+        # parity: graph == legacy
+        torch.testing.assert_close(
+            graph_out["energy_redu"], legacy_out["energy_redu"], **tol
+        )
+        torch.testing.assert_close(
+            graph_out["energy_derv_r"], legacy_out["energy_derv_r"], **tol
+        )
+
+        # 5. reference (no-exclude) via graph route
+        ref_out = model_ref.call_common(
+            self.coord.clone().requires_grad_(True),
+            self.atype,
+            box,
+            neighbor_graph_method="dense",
+        )
+        # exclusion must have an effect
+        e_diff = (graph_out["energy_redu"] - ref_out["energy_redu"]).abs().max().item()
+        assert e_diff > 1e-10, (
+            f"pair_exclude_types had no effect on energy; diff={e_diff:.3e}"
+        )
+
     @pytest.mark.parametrize("attn_layer", [0, 2])  # factorizable AND attention
     def test_graph_route_float32(self, attn_layer) -> None:
         """A float32 model runs the graph route and matches the dense route.

From 45b0e30a240e055bd6a126cdaa98b7c987fb0657 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 01:24:09 +0800
Subject: [PATCH 24/38] docs: add notes on nv_graph_builder pair_excl oracle
 gap

---
 deepmd/pt_expt/utils/nv_graph_builder.py                   | 6 ++++++
 source/tests/common/dpmodel/test_neighbor_graph_builder.py | 5 +++++
 2 files changed, 11 insertions(+)

diff --git a/deepmd/pt_expt/utils/nv_graph_builder.py b/deepmd/pt_expt/utils/nv_graph_builder.py
index 65db788d13..fe588c94e7 100644
--- a/deepmd/pt_expt/utils/nv_graph_builder.py
+++ b/deepmd/pt_expt/utils/nv_graph_builder.py
@@ -248,6 +248,12 @@ def build_neighbor_graph_nv(
     ------
     ImportError
         if ``nvalchemi-toolkit-ops`` (CUDA) is not installed.
+
+    Notes
+    -----
+    The ``pair_excl`` path of this builder has no local oracle set-equality test
+    because nvalchemiops requires CUDA; the set-equality contract must be
+    validated on a GPU box (same pattern as :func:`~deepmd.dpmodel.utils.neighbor_graph.build_neighbor_graph_ase`).
     """
     if not is_nv_available():
         raise ImportError(
diff --git a/source/tests/common/dpmodel/test_neighbor_graph_builder.py b/source/tests/common/dpmodel/test_neighbor_graph_builder.py
index 8325408ff9..30e1516a75 100644
--- a/source/tests/common/dpmodel/test_neighbor_graph_builder.py
+++ b/source/tests/common/dpmodel/test_neighbor_graph_builder.py
@@ -470,5 +470,10 @@ def test_ase_oracle_set_equality(self) -> None:
         self.assertLess(int(ng_ase.edge_mask.sum()), int(ng_ase_plain.edge_mask.sum()))
 
 
+# NOTE: nvalchemiops builder has no local oracle set-equality test for pair_excl
+# because it requires CUDA; validation is deferred to GPU box tests (PR-C/nv-gtest).
+# See deepmd.pt_expt.utils.nv_graph_builder.build_neighbor_graph_nv docstring.
+
+
 if __name__ == "__main__":
     unittest.main()

From f4e4691f832ab07a22e8fc16f18256ccfcc53b8d Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 01:29:39 +0800
Subject: [PATCH 25/38] test(pt_expt): graph-route exclude_types coverage
 (parity + make_fx)

---
 source/tests/pt_expt/descriptor/test_dpa1.py  | 19 +++++++++++---
 .../pt_expt/model/test_dpa1_graph_lower.py    | 25 ++++++++++++++++---
 2 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/source/tests/pt_expt/descriptor/test_dpa1.py b/source/tests/pt_expt/descriptor/test_dpa1.py
index cddd22419f..053263009a 100644
--- a/source/tests/pt_expt/descriptor/test_dpa1.py
+++ b/source/tests/pt_expt/descriptor/test_dpa1.py
@@ -252,14 +252,19 @@ def fn(coord_ext, atype_ext, nlist):
             atol=atol,
         )
 
+    @pytest.mark.parametrize(
+        "excl_types", [[], [(0, 1)]]
+    )  # no exclusion / type-0-1 pair exclusion
     @pytest.mark.parametrize("prec", ["float64"])  # precision
-    def test_make_fx_graph(self, prec) -> None:
+    def test_make_fx_graph(self, prec, excl_types) -> None:
         """make_fx (export-readiness) of the attn_layer=0 GRAPH forward.
 
         For ``attn_layer == 0`` the dense ``forward`` routes through the
         graph-native path (``from_dense_quartet -> call_graph``). This proves
         that graph forward + ``autograd.grad`` is fx-traceable (full .pt2
-        export is PR-B).
+        export is PR-B).  Parametrized over ``excl_types``: with non-empty
+        exclusion the ``build_edge_exclude_mask`` (mask-only, shape-static)
+        path is exercised — a tracing failure here would be a bug.
         """
         rng = np.random.default_rng(GLOBAL_SEED)
         _, _, nnei = self.nlist.shape
@@ -276,6 +281,7 @@ def test_make_fx_graph(self, prec) -> None:
             self.nt,
             attn_layer=0,
             precision=prec,
+            exclude_types=excl_types,
             seed=GLOBAL_SEED,
         ).to(self.device)
         dd0.se_atten.mean = torch.tensor(davg, dtype=dtype, device=self.device)
@@ -311,9 +317,12 @@ def fn(coord_ext, atype_ext, nlist, mapping):
             atol=atol,
         )
 
+    @pytest.mark.parametrize(
+        "excl_types", [[], [(0, 1)]]
+    )  # no exclusion / type-0-1 pair exclusion
     @pytest.mark.parametrize("smooth", [False, True])  # smooth attention branch
     @pytest.mark.parametrize("prec", ["float64"])  # precision
-    def test_make_fx_graph_attn(self, prec, smooth) -> None:
+    def test_make_fx_graph_attn(self, prec, smooth, excl_types) -> None:
         """make_fx (export-readiness) of the GRAPH forward with attention.
 
         MERGE BLOCKER (NeighborGraph PR-D): pt_expt compiled training routes
@@ -321,6 +330,9 @@ def test_make_fx_graph_attn(self, prec, smooth) -> None:
         (``attn_layer > 0``) must be fx-traceable — the shape-static
         ``center_edge_pairs`` form keeps the pair enumeration ``nonzero``-free.
         Covers both the smooth and non-smooth attention branches.
+        Parametrized over ``excl_types``: with non-empty exclusion the
+        ``build_edge_exclude_mask`` (mask-only, shape-static) path is exercised
+        concurrently with attention — a tracing failure here would be a bug.
         """
         rng = np.random.default_rng(GLOBAL_SEED)
         _, _, nnei = self.nlist.shape
@@ -338,6 +350,7 @@ def test_make_fx_graph_attn(self, prec, smooth) -> None:
             attn_dotr=True,
             smooth_type_embedding=smooth,
             precision=prec,
+            exclude_types=excl_types,
             seed=GLOBAL_SEED,
         ).to(self.device)
         dd0.se_atten.mean = torch.tensor(davg, dtype=dtype, device=self.device)
diff --git a/source/tests/pt_expt/model/test_dpa1_graph_lower.py b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
index fd09c23106..bc8a464aa0 100644
--- a/source/tests/pt_expt/model/test_dpa1_graph_lower.py
+++ b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
@@ -91,7 +91,12 @@ def setup_method(self) -> None:
             [[0, 0, 0, 1, 1]], dtype=torch.int64, device=self.device
         )
 
-    def _make_model(self, attn_layer: int = 0, smooth: bool = False) -> EnergyModel:
+    def _make_model(
+        self,
+        attn_layer: int = 0,
+        smooth: bool = False,
+        pair_excl_types: list | None = None,
+    ) -> EnergyModel:
         ds = DescrptDPA1(
             self.rcut,
             self.rcut_smth,
@@ -122,7 +127,12 @@ def _make_model(self, attn_layer: int = 0, smooth: bool = False) -> EnergyModel:
             precision="float64",
             seed=GLOBAL_SEED,
         ).to(self.device)
-        return EnergyModel(ds, ft, type_map=self.type_map).to(self.device)
+        return EnergyModel(
+            ds,
+            ft,
+            type_map=self.type_map,
+            pair_exclude_types=pair_excl_types or [],
+        ).to(self.device)
 
     def _prepare_lower_inputs(self, periodic: bool):
         """Build extended coords, atype, nlist, mapping as torch tensors."""
@@ -172,13 +182,20 @@ def _prepare_lower_inputs(self, periodic: bool):
     @pytest.mark.parametrize("attn_layer", [0, 2])  # factorizable AND attention
     @pytest.mark.parametrize("periodic", [True, False])  # PBC vs non-PBC
     @pytest.mark.parametrize("do_av", [False, True])  # atom-virial off / on
-    def test_force_virial_parity_vs_legacy(self, periodic, do_av, attn_layer) -> None:
+    @pytest.mark.parametrize(
+        "excl_types", [[], [(0, 1)]]
+    )  # no exclusion / type-0-1 pair exclusion
+    def test_force_virial_parity_vs_legacy(
+        self, periodic, do_av, attn_layer, excl_types
+    ) -> None:
         """Graph lower energy/force/virial/atom_virial == legacy dense lower on
         the SAME neighbor set (regime-1 graph from from_dense_quartet).
         attn_layer=2 exercises graph attention through model-level autograd
         (smooth=False: exact carry-all parity regime, NeighborGraph PR-D).
+        Parametrized over exclude_types: empty list (no exclusion) and
+        [(0,1)] (model-level pair exclusion applied identically on both routes).
         """
-        model = self._make_model(attn_layer=attn_layer)
+        model = self._make_model(attn_layer=attn_layer, pair_excl_types=excl_types)
         model.eval()
         tol = (
             {"rtol": 1e-12, "atol": 1e-12}

From 4c026666c9ce67559a198f328f37e57b78ad3273 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 01:33:07 +0800
Subject: [PATCH 26/38] test(pt_expt): descriptor-level exclude_types export +
 graph-vs-legacy parity

---
 source/tests/pt_expt/descriptor/test_dpa1.py  |  6 +-
 .../pt_expt/model/test_dpa1_graph_lower.py    | 61 +++++++++++++++++++
 2 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/source/tests/pt_expt/descriptor/test_dpa1.py b/source/tests/pt_expt/descriptor/test_dpa1.py
index 053263009a..25b3c47a44 100644
--- a/source/tests/pt_expt/descriptor/test_dpa1.py
+++ b/source/tests/pt_expt/descriptor/test_dpa1.py
@@ -114,7 +114,10 @@ def test_consistency(self, idt, sm, to, tm, prec, ect) -> None:
 
     @pytest.mark.parametrize("idt", [False, True])  # resnet_dt
     @pytest.mark.parametrize("prec", ["float64", "float32"])  # precision
-    def test_exportable(self, idt, prec) -> None:
+    @pytest.mark.parametrize(
+        "excl_types", [[], [(0, 1)]]
+    )  # no exclusion / type-0-1 pair exclusion
+    def test_exportable(self, idt, prec, excl_types) -> None:
         rng = np.random.default_rng(GLOBAL_SEED)
         _, _, nnei = self.nlist.shape
         davg = rng.normal(size=(self.nt, nnei, 4))
@@ -130,6 +133,7 @@ def test_exportable(self, idt, prec) -> None:
             attn_layer=2,
             precision=prec,
             resnet_dt=idt,
+            exclude_types=excl_types,
             seed=GLOBAL_SEED,
         ).to(self.device)
         dd0.se_atten.mean = torch.tensor(davg, dtype=dtype, device=self.device)
diff --git a/source/tests/pt_expt/model/test_dpa1_graph_lower.py b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
index bc8a464aa0..4d98c8258a 100644
--- a/source/tests/pt_expt/model/test_dpa1_graph_lower.py
+++ b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
@@ -96,6 +96,7 @@ def _make_model(
         attn_layer: int = 0,
         smooth: bool = False,
         pair_excl_types: list | None = None,
+        descr_excl_types: list | None = None,
     ) -> EnergyModel:
         ds = DescrptDPA1(
             self.rcut,
@@ -116,6 +117,7 @@ def _make_model(
             set_davg_zero=False,
             type_one_side=True,
             precision="float64",
+            exclude_types=descr_excl_types or [],
             seed=GLOBAL_SEED,
         ).to(self.device)
         ft = InvarFitting(
@@ -405,6 +407,65 @@ def test_pair_exclude_types_graph_vs_legacy(self) -> None:
             f"pair_exclude_types had no effect on energy; diff={e_diff:.3e}"
         )
 
+    @pytest.mark.parametrize("attn_layer", [0, 2])  # factorizable AND attention
+    def test_descriptor_exclude_types_graph_vs_legacy(self, attn_layer) -> None:
+        """Descriptor-level exclude_types: graph route and legacy dense agree
+        bit-tight (fp64, 1e-12) when exclusion is on the DESCRIPTOR (not the
+        model pair_exclude_types).  Uses identical weights across both routes;
+        also checks exclusion is non-vacuous vs a no-exclude baseline.
+        """
+        import copy
+
+        # 1. no-exclude model (graph route = reference)
+        model_ref = self._make_model(attn_layer=attn_layer)
+        model_ref.eval()
+
+        # 2. exclude model: inject exclude_types into the serialized dict
+        data = copy.deepcopy(model_ref.serialize())
+        data["descriptor"]["exclude_types"] = [[0, 1]]
+        model_excl = EnergyModel.deserialize(data).to(self.device)
+        model_excl.eval()
+
+        tol = (
+            {"rtol": 1e-12, "atol": 1e-12}
+            if self.device.type == "cpu"
+            else {"rtol": 1e-10, "atol": 1e-10}
+        )
+        box = self.cell.reshape(1, 9)
+
+        # 3. graph route
+        graph_out = model_excl.call_common(
+            self.coord.clone().requires_grad_(True),
+            self.atype,
+            box,
+            neighbor_graph_method="dense",
+        )
+        # 4. legacy dense route
+        legacy_out = model_excl.call_common(
+            self.coord.clone().requires_grad_(True),
+            self.atype,
+            box,
+            neighbor_graph_method="legacy",
+        )
+        torch.testing.assert_close(
+            graph_out["energy_redu"], legacy_out["energy_redu"], **tol
+        )
+        torch.testing.assert_close(
+            graph_out["energy_derv_r"], legacy_out["energy_derv_r"], **tol
+        )
+
+        # 5. exclusion must be non-vacuous
+        ref_out = model_ref.call_common(
+            self.coord.clone().requires_grad_(True),
+            self.atype,
+            box,
+            neighbor_graph_method="dense",
+        )
+        e_diff = (graph_out["energy_redu"] - ref_out["energy_redu"]).abs().max().item()
+        assert e_diff > 1e-10, (
+            f"descriptor exclude_types had no effect on energy; diff={e_diff:.3e}"
+        )
+
     @pytest.mark.parametrize("attn_layer", [0, 2])  # factorizable AND attention
     def test_graph_route_float32(self, attn_layer) -> None:
         """A float32 model runs the graph route and matches the dense route.

From b84226313a922abaa84806400a91a39d6ea7c070 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 01:37:42 +0800
Subject: [PATCH 27/38] fix: add reduced-virial parity assertion in descriptor
 exclude_types test

---
 source/tests/pt_expt/model/test_dpa1_graph_lower.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/source/tests/pt_expt/model/test_dpa1_graph_lower.py b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
index 4d98c8258a..e285f2ac97 100644
--- a/source/tests/pt_expt/model/test_dpa1_graph_lower.py
+++ b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
@@ -15,6 +15,8 @@
 reduced (per-frame) virial are frame/local quantities and compare directly.
 """
 
+import copy
+
 import numpy as np
 import pytest
 import torch
@@ -414,8 +416,6 @@ def test_descriptor_exclude_types_graph_vs_legacy(self, attn_layer) -> None:
         model pair_exclude_types).  Uses identical weights across both routes;
         also checks exclusion is non-vacuous vs a no-exclude baseline.
         """
-        import copy
-
         # 1. no-exclude model (graph route = reference)
         model_ref = self._make_model(attn_layer=attn_layer)
         model_ref.eval()
@@ -453,6 +453,9 @@ def test_descriptor_exclude_types_graph_vs_legacy(self, attn_layer) -> None:
         torch.testing.assert_close(
             graph_out["energy_derv_r"], legacy_out["energy_derv_r"], **tol
         )
+        torch.testing.assert_close(
+            graph_out["energy_derv_c_redu"], legacy_out["energy_derv_c_redu"], **tol
+        )
 
         # 5. exclusion must be non-vacuous
         ref_out = model_ref.call_common(

From f7cdb467601f1122a906a2fc6da20ac8066fd8e7 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 01:43:31 +0800
Subject: [PATCH 28/38] fix(neighbor_graph): use logical_and+bool cast in
 apply_pair_exclusion for array_api_strict compat

---
 deepmd/dpmodel/utils/neighbor_graph/graph.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepmd/dpmodel/utils/neighbor_graph/graph.py b/deepmd/dpmodel/utils/neighbor_graph/graph.py
index dd82b545de..bef853682b 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/graph.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/graph.py
@@ -219,7 +219,7 @@ def apply_pair_exclusion(
     keep = pair_excl.build_edge_exclude_mask(graph.edge_index, atype)
     out = dataclasses.replace(
         graph,
-        edge_mask=graph.edge_mask * xp.astype(keep, graph.edge_mask.dtype),
+        edge_mask=xp.logical_and(graph.edge_mask, xp.astype(keep, xp.bool)),
     )
     if compact:
         if graph.angle_index is not None or graph.angle_mask is not None:

From 8ba6fa80dc0cdf8f6168a0a59ff38891c5d570d5 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 01:58:53 +0800
Subject: [PATCH 29/38] feat(dpmodel): apply_pair_exclusion_nlist helper +
 pair_excl on build_neighbor_list/strategies (A4)

Extract the inline pair-exclusion from base_atomic_model.forward_common_atomic
into apply_pair_exclusion_nlist(nlist, atype_ext, pair_excl) in nlist.py.
The seam is refactored to call the named helper (idempotent backstop remains).

Add pair_excl=None to:
- build_neighbor_list (dpmodel, nlist.py)
- DefaultNeighborList.build
- VesinNeighborList.build (pt_expt)
- NvNeighborList.build (pt; CUDA-only, API parity)
- NeighborList base class signature

12 new unit tests covering: None/empty identity, excluded pairs -> -1,
-1 slot preservation, ghost-atom types, idempotence, torch namespace smoke,
build_neighbor_list oracle equivalence, DefaultNeighborList oracle,
VesinNeighborList oracle. NvNeighborList CUDA-only (not validated locally).
---
 .../dpmodel/atomic_model/base_atomic_model.py |   8 +-
 deepmd/dpmodel/utils/__init__.py              |   2 +
 deepmd/dpmodel/utils/default_neighbor_list.py |  40 ++-
 deepmd/dpmodel/utils/neighbor_list.py         |  11 +
 deepmd/dpmodel/utils/nlist.py                 |  54 +++-
 deepmd/pt/utils/nv_nlist.py                   |  19 ++
 deepmd/pt_expt/utils/vesin_neighbor_list.py   |  22 ++
 .../test_apply_pair_exclusion_nlist.py        | 268 ++++++++++++++++++
 8 files changed, 416 insertions(+), 8 deletions(-)
 create mode 100644 source/tests/common/dpmodel/test_apply_pair_exclusion_nlist.py

diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py
index 866bb22329..cbe0bdf5d4 100644
--- a/deepmd/dpmodel/atomic_model/base_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py
@@ -36,6 +36,7 @@
 from deepmd.dpmodel.utils import (
     AtomExcludeMask,
     PairExcludeMask,
+    apply_pair_exclusion_nlist,
 )
 from deepmd.env import (
     GLOBAL_NP_FLOAT_PRECISION,
@@ -297,10 +298,9 @@ def forward_common_atomic(
         xp = array_api_compat.array_namespace(extended_coord, extended_atype, nlist)
         _, nloc, _ = nlist.shape
         atype = xp_take_first_n(extended_atype, 1, nloc)
-        if self.pair_excl is not None:
-            pair_mask = self.pair_excl.build_type_exclude_mask(nlist, extended_atype)
-            # exclude neighbors in the nlist
-            nlist = xp.where(pair_mask == 1, nlist, -1)
+        # idempotent backstop: externally-supplied nlists (C++/LAMMPS, call_lower
+        # users) bypass the in-tree builders and land here still unfiltered.
+        nlist = apply_pair_exclusion_nlist(nlist, extended_atype, self.pair_excl)
 
         ext_atom_mask = self.make_atom_mask(extended_atype)
         ret_dict = self.forward_atomic(
diff --git a/deepmd/dpmodel/utils/__init__.py b/deepmd/dpmodel/utils/__init__.py
index 3593af5c16..3e439f173e 100644
--- a/deepmd/dpmodel/utils/__init__.py
+++ b/deepmd/dpmodel/utils/__init__.py
@@ -48,6 +48,7 @@
     make_multilayer_network,
 )
 from .nlist import (
+    apply_pair_exclusion_nlist,
     build_multiple_neighbor_list,
     build_neighbor_list,
     extend_coord_with_ghosts,
@@ -94,6 +95,7 @@
     "PairExcludeMask",
     "SameNlocBatchSampler",
     "aggregate",
+    "apply_pair_exclusion_nlist",
     "build_multiple_neighbor_list",
     "build_neighbor_graph",
     "build_neighbor_graph_ase",
diff --git a/deepmd/dpmodel/utils/default_neighbor_list.py b/deepmd/dpmodel/utils/default_neighbor_list.py
index 3628664c5a..c759e2f21d 100644
--- a/deepmd/dpmodel/utils/default_neighbor_list.py
+++ b/deepmd/dpmodel/utils/default_neighbor_list.py
@@ -1,6 +1,10 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 """Default all-pairs neighbor-list builder (historical deepmd behavior)."""
 
+from typing import (
+    TYPE_CHECKING,
+)
+
 import array_api_compat
 
 from deepmd.dpmodel.array_api import (
@@ -21,6 +25,9 @@
     normalize_coord,
 )
 
+if TYPE_CHECKING:
+    from deepmd.dpmodel.utils.exclude_mask import PairExcludeMask
+
 
 class DefaultNeighborList(NeighborList):
     """All-pairs builder: replicate the cell into periodic images and rank by
@@ -37,7 +44,36 @@ def build(
         rcut: float,
         sel: list[int],
         return_mode: str = "extended",
+        pair_excl: "PairExcludeMask | None" = None,
     ) -> tuple[Array, Array, Array, Array] | EdgeNeighborList:
+        """Build extended coordinates and a candidate neighbor list.
+
+        Parameters
+        ----------
+        coord : Array
+            Local coordinates, shape ``(nf, nloc, 3)`` or ``(nf, nloc*3)``.
+        atype : Array
+            Local atom types, shape ``(nf, nloc)``.
+        box : Array or None
+            Simulation cell, shape ``(nf, 3, 3)`` or ``(nf, 9)``; ``None``
+            for non-periodic systems.
+        rcut : float
+            Cutoff radius.
+        sel : list[int]
+            Number of selected neighbors per type.
+        return_mode : str
+            Must be ``"extended"`` (the only mode this builder supports).
+        pair_excl : PairExcludeMask or None, optional
+            When provided, excluded type pairs are erased from the returned
+            neighbor list immediately after the geometric search by
+            :func:`~deepmd.dpmodel.utils.nlist.build_neighbor_list`.
+
+        Returns
+        -------
+        tuple[Array, Array, Array, Array]
+            ``(extended_coord, extended_atype, nlist, mapping)`` as documented
+            in :meth:`~deepmd.dpmodel.utils.neighbor_list.NeighborList.build`.
+        """
         if return_mode != "extended":
             raise NotImplementedError(
                 "DefaultNeighborList only supports the extended-coordinate contract."
@@ -54,7 +90,8 @@ def build(
         extended_coord, extended_atype, mapping = extend_coord_with_ghosts(
             coord_normalized, atype, box, rcut
         )
-        # types are distinguished in the lower interface, so keep them merged here
+        # types are distinguished in the lower interface, so keep them merged here;
+        # pair_excl is forwarded so exclusion is applied at build time.
         nlist = build_neighbor_list(
             extended_coord,
             extended_atype,
@@ -62,6 +99,7 @@ def build(
             rcut,
             sel,
             distinguish_types=False,
+            pair_excl=pair_excl,
         )
         extended_coord = xp.reshape(extended_coord, (nframes, -1, 3))
         return extended_coord, extended_atype, nlist, mapping
diff --git a/deepmd/dpmodel/utils/neighbor_list.py b/deepmd/dpmodel/utils/neighbor_list.py
index e63f5099dd..37c0f63309 100644
--- a/deepmd/dpmodel/utils/neighbor_list.py
+++ b/deepmd/dpmodel/utils/neighbor_list.py
@@ -14,6 +14,7 @@
     dataclass,
 )
 from typing import (
+    TYPE_CHECKING,
     Literal,
 )
 
@@ -21,6 +22,9 @@
     Array,
 )
 
+if TYPE_CHECKING:
+    from deepmd.dpmodel.utils.exclude_mask import PairExcludeMask
+
 
 @dataclass
 class EdgeNeighborList:
@@ -69,6 +73,7 @@ def build(
         rcut: float,
         sel: list[int],
         return_mode: Literal["extended", "edges"] = "extended",
+        pair_excl: "PairExcludeMask | None" = None,
     ) -> tuple[Array, Array, Array, Array] | EdgeNeighborList:
         """Build the extended system and a candidate neighbor list.
 
@@ -88,6 +93,12 @@ def build(
             ``"extended"`` returns the historical extended-coordinate quartet.
             ``"edges"`` returns :class:`EdgeNeighborList`, where ``edge_vec`` is
             the only geometric displacement consumed by the model.
+        pair_excl : PairExcludeMask or None, optional
+            When provided, excluded type pairs are erased from the returned
+            neighbor list (entries set to ``-1``) by calling
+            :func:`~deepmd.dpmodel.utils.nlist.apply_pair_exclusion_nlist`.
+            Implementations that do not override this parameter fall back to
+            the default post-build application in the base interface.
 
         Returns
         -------
diff --git a/deepmd/dpmodel/utils/nlist.py b/deepmd/dpmodel/utils/nlist.py
index 59e68a64a0..9d6a95cb48 100644
--- a/deepmd/dpmodel/utils/nlist.py
+++ b/deepmd/dpmodel/utils/nlist.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 
 from typing import (
+    TYPE_CHECKING,
     Any,
 )
 
@@ -17,6 +18,9 @@
     to_face_distance,
 )
 
+if TYPE_CHECKING:
+    from deepmd.dpmodel.utils.exclude_mask import PairExcludeMask
+
 
 def _is_ndtensorflow_namespace(xp: Any) -> bool:
     return getattr(xp, "__name__", "") == "deepmd._vendors.ndtensorflow"
@@ -71,6 +75,44 @@ def extend_input_and_build_neighbor_list(
     return extended_coord, extended_atype, mapping, nlist
 
 
+def apply_pair_exclusion_nlist(
+    nlist: Array,
+    atype_ext: Array,
+    pair_excl: "PairExcludeMask | None",
+) -> Array:
+    """Apply model-level pair-type exclusion to a dense neighbor list.
+
+    Replaces excluded neighbor entries with ``-1`` so that downstream
+    descriptors see them as empty slots.  Identity (returns ``nlist``
+    unchanged) when *pair_excl* is ``None`` or its exclude-types list is
+    empty.
+
+    This is the nlist-representation counterpart of
+    :func:`deepmd.dpmodel.utils.neighbor_graph.apply_pair_exclusion`.
+
+    Parameters
+    ----------
+    nlist : Array
+        Dense neighbor list of shape ``(nf, nloc, nnei)``.  Entries equal
+        to ``-1`` indicate empty / padding slots.
+    atype_ext : Array
+        Extended atom types of shape ``(nf, nall)``.
+    pair_excl : PairExcludeMask or None
+        Exclusion mask object, or ``None`` / empty to skip.
+
+    Returns
+    -------
+    Array
+        Neighbor list of the same shape with excluded entries set to ``-1``.
+        Applying the exclusion to the result again is a no-op (idempotent).
+    """
+    if pair_excl is None or len(pair_excl.exclude_types) == 0:
+        return nlist
+    xp = array_api_compat.array_namespace(nlist, atype_ext)
+    pair_mask = pair_excl.build_type_exclude_mask(nlist, atype_ext)
+    return xp.where(pair_mask == 1, nlist, xp.full_like(nlist, -1))
+
+
 ## translated from torch implementation by chatgpt
 def build_neighbor_list(
     coord: Array,
@@ -79,6 +121,7 @@ def build_neighbor_list(
     rcut: float,
     sel: int | list[int],
     distinguish_types: bool = True,
+    pair_excl: "PairExcludeMask | None" = None,
 ) -> Array:
     """Build neighbor list for a single frame. keeps nsel neighbors.
 
@@ -100,6 +143,12 @@ def build_neighbor_list(
         types.
     distinguish_types : bool
         distinguish different types.
+    pair_excl : PairExcludeMask or None, optional
+        When provided, excluded type pairs are erased from the returned
+        neighbor list (entries set to ``-1``) immediately after the
+        geometric search.  This is a convenience shortcut for calling
+        :func:`apply_pair_exclusion_nlist` separately.  ``None`` (default)
+        leaves the list unchanged.
 
     Returns
     -------
@@ -195,9 +244,8 @@ def build_neighbor_list(
     )
 
     if distinguish_types:
-        return nlist_distinguish_types(nlist, atype, sel)
-    else:
-        return nlist
+        nlist = nlist_distinguish_types(nlist, atype, sel)
+    return apply_pair_exclusion_nlist(nlist, atype, pair_excl)
 
 
 def nlist_distinguish_types(
diff --git a/deepmd/pt/utils/nv_nlist.py b/deepmd/pt/utils/nv_nlist.py
index 08fc308f72..80d6c85cde 100644
--- a/deepmd/pt/utils/nv_nlist.py
+++ b/deepmd/pt/utils/nv_nlist.py
@@ -51,6 +51,8 @@
         Iterator,
     )
 
+    from deepmd.dpmodel.utils.exclude_mask import PairExcludeMask
+
 
 @contextlib.contextmanager
 def _suppress_native_stderr() -> Iterator[None]:
@@ -155,11 +157,22 @@ def build(
         rcut: float,
         sel: list[int],
         return_mode: str = "extended",
+        pair_excl: PairExcludeMask | None = None,
     ) -> tuple[Any, Any, Any, Any] | EdgeNeighborList:
         """Build the extended system and neighbor list.
 
         See :meth:`deepmd.dpmodel.utils.neighbor_list.NeighborList.build`. The
         returned ``nlist`` is distance-sorted and truncated to ``sum(sel)``.
+
+        Parameters
+        ----------
+        pair_excl : PairExcludeMask or None, optional
+            When provided, excluded type pairs are erased from the returned
+            neighbor list (entries set to ``-1``) by
+            :func:`~deepmd.dpmodel.utils.nlist.apply_pair_exclusion_nlist`.
+            ``NvNeighborList`` is CUDA-only; the ``pair_excl`` parameter is
+            accepted for API parity with the other strategies but cannot be
+            validated on a CPU-only machine.
         """
         device = coord.device
         nf, nloc = atype.shape[:2]
@@ -207,6 +220,12 @@ def build(
             nlist = _truncate_to_sel_compiled(
                 extended_coord, nlist, target_neighbors, float(rcut)
             )
+            if pair_excl is not None:
+                from deepmd.dpmodel.utils.nlist import (
+                    apply_pair_exclusion_nlist,
+                )
+
+                nlist = apply_pair_exclusion_nlist(nlist, extended_atype, pair_excl)
             return extended_coord, extended_atype, nlist, mapping
 
 
diff --git a/deepmd/pt_expt/utils/vesin_neighbor_list.py b/deepmd/pt_expt/utils/vesin_neighbor_list.py
index 39a0dff883..1b141ebad1 100644
--- a/deepmd/pt_expt/utils/vesin_neighbor_list.py
+++ b/deepmd/pt_expt/utils/vesin_neighbor_list.py
@@ -19,6 +19,7 @@
 """
 
 from typing import (
+    TYPE_CHECKING,
     Any,
 )
 
@@ -28,6 +29,9 @@
     EdgeNeighborList,
     NeighborList,
 )
+
+if TYPE_CHECKING:
+    from deepmd.dpmodel.utils.exclude_mask import PairExcludeMask
 from deepmd.pt_expt.utils.edge_schema import (
     edge_schema_from_ij_shifts,
     merge_frame_edge_schemas,
@@ -60,6 +64,7 @@ def build(
         rcut: float,
         sel: list[int],
         return_mode: str = "extended",
+        pair_excl: "PairExcludeMask | None" = None,
     ) -> tuple[Any, Any, Any, Any] | EdgeNeighborList:
         """Build the extended system + candidate neighbor list with vesin.
 
@@ -67,6 +72,16 @@ def build(
         returned ``nlist`` is distance-sorted and truncated to ``sum(sel)``
         (matching the default builder); the lower interface still re-formats /
         type-splits it.
+
+        Parameters
+        ----------
+        pair_excl : PairExcludeMask or None, optional
+            When provided, excluded type pairs are erased from the returned
+            neighbor list (entries set to ``-1``) by
+            :func:`~deepmd.dpmodel.utils.nlist.apply_pair_exclusion_nlist`.
+            The atomic-model seam applies the same filter as an idempotent
+            backstop, so passing ``pair_excl`` here is a build-time
+            optimization that avoids re-scanning per forward call.
         """
         is_numpy = not isinstance(coord, torch.Tensor)
         # vesin runs on the device of the inputs: numpy (the dpmodel backend) is
@@ -146,6 +161,13 @@ def build(
         nlist = torch.stack(nlists, dim=0)
         mapping = torch.stack(mappings, dim=0)
 
+        if pair_excl is not None:
+            from deepmd.dpmodel.utils.nlist import (
+                apply_pair_exclusion_nlist,
+            )
+
+            nlist = apply_pair_exclusion_nlist(nlist, extended_atype, pair_excl)
+
         if is_numpy:
             return (
                 extended_coord.detach().cpu().numpy(),
diff --git a/source/tests/common/dpmodel/test_apply_pair_exclusion_nlist.py b/source/tests/common/dpmodel/test_apply_pair_exclusion_nlist.py
new file mode 100644
index 0000000000..1ecdf8dfb5
--- /dev/null
+++ b/source/tests/common/dpmodel/test_apply_pair_exclusion_nlist.py
@@ -0,0 +1,268 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Unit tests for :func:`apply_pair_exclusion_nlist` and the
+``pair_excl`` parameter of :func:`build_neighbor_list` /
+``DefaultNeighborList.build``.
+"""
+
+import numpy as np
+import pytest
+
+from deepmd.dpmodel.utils.exclude_mask import PairExcludeMask
+from deepmd.dpmodel.utils.nlist import (
+    apply_pair_exclusion_nlist,
+    build_neighbor_list,
+)
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+def _make_nlist_atype():
+    """2-frame, 3-local, 4-neighbor test fixture.
+
+    nlist[f, i, k] = index of the k-th neighbor of atom i in frame f.
+    -1 = empty slot.
+
+    Frame 0 - atoms 0,1,2 (types 0,1,0); ghost atom 3 (type 1).
+    Frame 1 - same layout, different neighbor assignment.
+    """
+    # shape (nf=2, nloc=3, nnei=4)
+    nlist = np.array(
+        [
+            # frame 0
+            [[1, 2, 3, -1], [0, 3, -1, -1], [0, 1, -1, -1]],
+            # frame 1
+            [[2, 3, -1, -1], [0, -1, -1, -1], [1, 2, 3, -1]],
+        ],
+        dtype=np.int64,
+    )
+    # extended atype: local atoms + 1 ghost; shape (nf=2, nall=4)
+    # types: atom0=0, atom1=1, atom2=0, ghost3=1
+    atype_ext = np.array(
+        [[0, 1, 0, 1], [0, 1, 0, 1]],
+        dtype=np.int64,
+    )
+    return nlist, atype_ext
+
+
+# ---------------------------------------------------------------------------
+# apply_pair_exclusion_nlist — unit tests
+# ---------------------------------------------------------------------------
+
+
+def test_none_is_identity() -> None:
+    nlist, atype_ext = _make_nlist_atype()
+    result = apply_pair_exclusion_nlist(nlist, atype_ext, None)
+    assert result is nlist
+
+
+def test_empty_exclude_list_is_identity() -> None:
+    nlist, atype_ext = _make_nlist_atype()
+    excl = PairExcludeMask(2, [])
+    result = apply_pair_exclusion_nlist(nlist, atype_ext, excl)
+    assert result is nlist
+
+
+def test_excluded_pairs_become_minus_one() -> None:
+    """Neighbors of excluded type should become -1."""
+    nlist, atype_ext = _make_nlist_atype()
+    # Exclude (type0, type1) pairs (symmetric: also type1,type0).
+    excl = PairExcludeMask(2, [(0, 1)])
+    result = apply_pair_exclusion_nlist(nlist, atype_ext, excl)
+
+    # Frame 0, atom 0 (type 0): neighbors 1(type1), 2(type0), 3(type1), -1
+    # (0,1) and (0,3) are excluded; (0,2) kept; -1 stays -1
+    np.testing.assert_array_equal(result[0, 0], [-1, 2, -1, -1])
+    # Frame 0, atom 1 (type 1): neighbors 0(type0), 3(type1), -1, -1
+    # (1,0) excluded; (1,3) kept (type1,type1 not excluded)
+    np.testing.assert_array_equal(result[0, 1], [-1, 3, -1, -1])
+    # Frame 0, atom 2 (type 0): neighbors 0(type0), 1(type1), -1, -1
+    # (2,0) kept (0,0); (2,1) excluded (0,1)
+    np.testing.assert_array_equal(result[0, 2], [0, -1, -1, -1])
+
+
+def test_already_empty_slots_preserved() -> None:
+    """Entries that are already -1 must remain -1 after exclusion."""
+    nlist, atype_ext = _make_nlist_atype()
+    excl = PairExcludeMask(2, [(0, 1)])
+    result = apply_pair_exclusion_nlist(nlist, atype_ext, excl)
+    # -1 positions in the original nlist must still be -1
+    original_minus_one = nlist == -1
+    np.testing.assert_array_equal(result[original_minus_one], -1)
+
+
+def test_ghost_atom_type_respected() -> None:
+    """Ghost atoms (index >= nloc) are indexed by atype_ext; their types count."""
+    nlist, atype_ext = _make_nlist_atype()
+    # atype_ext[*,3] == 1; atom 0 is type 0.  (0,1) is excluded.
+    # atom 0 in frame 0 lists neighbor 3 (ghost, type 1) -> should be excluded.
+    excl = PairExcludeMask(2, [(0, 1)])
+    result = apply_pair_exclusion_nlist(nlist, atype_ext, excl)
+    assert result[0, 0, 2] == -1  # was 3 (type 1), now excluded
+
+
+def test_idempotent_double_application() -> None:
+    """Applying exclusion twice must give the same result as applying once."""
+    nlist, atype_ext = _make_nlist_atype()
+    excl = PairExcludeMask(2, [(0, 1)])
+    once = apply_pair_exclusion_nlist(nlist, atype_ext, excl)
+    twice = apply_pair_exclusion_nlist(once, atype_ext, excl)
+    np.testing.assert_array_equal(once, twice)
+
+
+def test_no_matching_pairs_leaves_nlist_unchanged() -> None:
+    """Exclusion list non-empty but no edge matches — nlist unchanged."""
+    nlist, atype_ext = _make_nlist_atype()
+    # All atoms are type 0 or 1; exclude (2,3) which doesn't appear.
+    excl = PairExcludeMask(4, [(2, 3)])
+    result = apply_pair_exclusion_nlist(nlist, atype_ext, excl)
+    np.testing.assert_array_equal(result, nlist)
+
+
+# ---------------------------------------------------------------------------
+# apply_pair_exclusion_nlist — torch namespace smoke test
+# ---------------------------------------------------------------------------
+
+
+def test_torch_namespace_smoke() -> None:
+    torch = pytest.importorskip("torch")
+    nlist, atype_ext = _make_nlist_atype()
+    nlist_t = torch.from_numpy(nlist)
+    atype_t = torch.from_numpy(atype_ext)
+    excl = PairExcludeMask(2, [(0, 1)])
+    result = apply_pair_exclusion_nlist(nlist_t, atype_t, excl)
+    np.testing.assert_array_equal(
+        result.numpy(), apply_pair_exclusion_nlist(nlist, atype_ext, excl)
+    )
+
+
+# ---------------------------------------------------------------------------
+# build_neighbor_list pair_excl parameter
+# ---------------------------------------------------------------------------
+
+
+def _simple_extended_system():
+    """2-atom local system with 1 ghost to give a nontrivial nlist.
+
+    Atoms: local 0 (type 0) at (0,0,0), local 1 (type 1) at (1,0,0).
+    Ghost 2 (type 1) at (-1,0,0) (periodic image of atom 1 across pbc).
+
+    rcut=1.5: atom 0 sees neighbors 1, 2; atom 1 sees neighbor 0.
+    """
+    # nf=1, nall=3
+    coord_ext = np.array(
+        [[[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [-1.0, 0.0, 0.0]]],
+        dtype=np.float64,
+    )
+    atype_ext = np.array([[0, 1, 1]], dtype=np.int64)
+    return coord_ext, atype_ext
+
+
+def test_build_neighbor_list_pair_excl_equals_post_application() -> None:
+    """build_neighbor_list(pair_excl=excl) must equal apply_pair_exclusion_nlist
+    applied to the result without pair_excl (oracle equivalence).
+    """
+    coord_ext, atype_ext = _simple_extended_system()
+    nloc = 2
+    rcut = 1.5
+    sel = [4]
+    excl = PairExcludeMask(2, [(0, 1)])
+
+    nlist_plain = build_neighbor_list(
+        coord_ext, atype_ext, nloc, rcut, sel, distinguish_types=False
+    )
+    nlist_excl_builtin = build_neighbor_list(
+        coord_ext, atype_ext, nloc, rcut, sel, distinguish_types=False, pair_excl=excl
+    )
+    nlist_excl_manual = apply_pair_exclusion_nlist(nlist_plain, atype_ext, excl)
+
+    np.testing.assert_array_equal(nlist_excl_builtin, nlist_excl_manual)
+
+
+def test_build_neighbor_list_none_excl_unchanged() -> None:
+    """pair_excl=None must not change the nlist."""
+    coord_ext, atype_ext = _simple_extended_system()
+    nloc = 2
+    rcut = 1.5
+    sel = [4]
+
+    nlist_plain = build_neighbor_list(
+        coord_ext, atype_ext, nloc, rcut, sel, distinguish_types=False
+    )
+    nlist_with_none = build_neighbor_list(
+        coord_ext, atype_ext, nloc, rcut, sel, distinguish_types=False, pair_excl=None
+    )
+    np.testing.assert_array_equal(nlist_plain, nlist_with_none)
+
+
+# ---------------------------------------------------------------------------
+# DefaultNeighborList.build pair_excl parameter
+# ---------------------------------------------------------------------------
+
+
+def _local_system():
+    """Return a simple local (non-extended) 2-atom system for DefaultNeighborList."""
+    # shape (nf=1, nloc=2, 3)
+    coord = np.array([[[0.0, 0.0, 0.0], [1.0, 0.0, 0.0]]], dtype=np.float64)
+    atype = np.array([[0, 1]], dtype=np.int64)
+    return coord, atype
+
+
+def test_default_neighbor_list_pair_excl_equals_seam() -> None:
+    """DefaultNeighborList.build(pair_excl=excl) nlist equals build-then-apply."""
+    from deepmd.dpmodel.utils.default_neighbor_list import DefaultNeighborList
+
+    coord, atype = _local_system()
+    rcut = 1.5
+    sel = [4]
+    excl = PairExcludeMask(2, [(0, 1)])
+
+    builder = DefaultNeighborList()
+    # Build without exclusion
+    ext_coord, ext_atype, nlist_plain, _ = builder.build(
+        coord, atype, box=None, rcut=rcut, sel=sel
+    )
+    nlist_manual = apply_pair_exclusion_nlist(nlist_plain, ext_atype, excl)
+
+    # Build with exclusion at builder level
+    _, ext_atype2, nlist_builtin, _ = builder.build(
+        coord, atype, box=None, rcut=rcut, sel=sel, pair_excl=excl
+    )
+    np.testing.assert_array_equal(ext_atype, ext_atype2)
+    np.testing.assert_array_equal(nlist_builtin, nlist_manual)
+
+
+# ---------------------------------------------------------------------------
+# VesinNeighborList.build pair_excl (torch only)
+# ---------------------------------------------------------------------------
+
+
+def test_vesin_neighbor_list_pair_excl_equals_seam() -> None:
+    """VesinNeighborList.build(pair_excl=excl) nlist equals build-then-apply."""
+    torch = pytest.importorskip("torch")
+    from deepmd.pt_expt.utils.vesin_neighbor_list import (
+        VesinNeighborList,
+        is_vesin_torch_available,
+    )
+
+    if not is_vesin_torch_available():
+        pytest.skip("vesin.torch not installed")
+
+    coord = torch.tensor([[[0.0, 0.0, 0.0], [1.0, 0.0, 0.0]]], dtype=torch.float64)
+    atype = torch.tensor([[0, 1]], dtype=torch.int64)
+    rcut = 1.5
+    sel = [4]
+    excl = PairExcludeMask(2, [(0, 1)])
+
+    builder = VesinNeighborList()
+    ext_coord, ext_atype, nlist_plain, _ = builder.build(
+        coord, atype, box=None, rcut=rcut, sel=sel
+    )
+    nlist_manual = apply_pair_exclusion_nlist(nlist_plain, ext_atype, excl)
+
+    _, ext_atype2, nlist_builtin, _ = builder.build(
+        coord, atype, box=None, rcut=rcut, sel=sel, pair_excl=excl
+    )
+    np.testing.assert_array_equal(nlist_builtin.numpy(), nlist_manual.numpy())

From 4a9b6b5e88cde3f9b3c39e33a8235c8144f61d93 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 02:06:17 +0800
Subject: [PATCH 30/38] fix(dpmodel): guard edges+pair_excl, fix base docstring
 (A4 review)

---
 deepmd/dpmodel/utils/neighbor_list.py         |  3 +--
 deepmd/pt/utils/nv_nlist.py                   |  7 ++++++
 deepmd/pt_expt/utils/vesin_neighbor_list.py   |  7 ++++++
 .../test_apply_pair_exclusion_nlist.py        | 25 +++++++++++++++++++
 .../pt_expt/utils/test_vesin_graph_builder.py | 12 +++++++++
 5 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/deepmd/dpmodel/utils/neighbor_list.py b/deepmd/dpmodel/utils/neighbor_list.py
index 37c0f63309..947e8d2d33 100644
--- a/deepmd/dpmodel/utils/neighbor_list.py
+++ b/deepmd/dpmodel/utils/neighbor_list.py
@@ -97,8 +97,7 @@ def build(
             When provided, excluded type pairs are erased from the returned
             neighbor list (entries set to ``-1``) by calling
             :func:`~deepmd.dpmodel.utils.nlist.apply_pair_exclusion_nlist`.
-            Implementations that do not override this parameter fall back to
-            the default post-build application in the base interface.
+            Subclasses are expected to apply this filter before returning.
 
         Returns
         -------
diff --git a/deepmd/pt/utils/nv_nlist.py b/deepmd/pt/utils/nv_nlist.py
index 80d6c85cde..c6cb7c4ef8 100644
--- a/deepmd/pt/utils/nv_nlist.py
+++ b/deepmd/pt/utils/nv_nlist.py
@@ -173,7 +173,14 @@ def build(
             ``NvNeighborList`` is CUDA-only; the ``pair_excl`` parameter is
             accepted for API parity with the other strategies but cannot be
             validated on a CPU-only machine.
+            ``return_mode='edges'`` does not support ``pair_excl``; a
+            :class:`NotImplementedError` is raised in that combination.
         """
+        if return_mode == "edges" and pair_excl is not None:
+            raise NotImplementedError(
+                "pair_excl is not supported with return_mode='edges'; "
+                "use apply_pair_exclusion (graph variant) on the returned EdgeNeighborList."
+            )
         device = coord.device
         nf, nloc = atype.shape[:2]
         target_neighbors = int(sum(sel))
diff --git a/deepmd/pt_expt/utils/vesin_neighbor_list.py b/deepmd/pt_expt/utils/vesin_neighbor_list.py
index 1b141ebad1..4965dbf77a 100644
--- a/deepmd/pt_expt/utils/vesin_neighbor_list.py
+++ b/deepmd/pt_expt/utils/vesin_neighbor_list.py
@@ -82,7 +82,14 @@ def build(
             The atomic-model seam applies the same filter as an idempotent
             backstop, so passing ``pair_excl`` here is a build-time
             optimization that avoids re-scanning per forward call.
+            ``return_mode='edges'`` does not support ``pair_excl``; a
+            :class:`NotImplementedError` is raised in that combination.
         """
+        if return_mode == "edges" and pair_excl is not None:
+            raise NotImplementedError(
+                "pair_excl is not supported with return_mode='edges'; "
+                "use apply_pair_exclusion (graph variant) on the returned EdgeNeighborList."
+            )
         is_numpy = not isinstance(coord, torch.Tensor)
         # vesin runs on the device of the inputs: numpy (the dpmodel backend) is
         # bridged through CPU torch; torch tensors stay on their own device.  Pin
diff --git a/source/tests/common/dpmodel/test_apply_pair_exclusion_nlist.py b/source/tests/common/dpmodel/test_apply_pair_exclusion_nlist.py
index 1ecdf8dfb5..fde86e7530 100644
--- a/source/tests/common/dpmodel/test_apply_pair_exclusion_nlist.py
+++ b/source/tests/common/dpmodel/test_apply_pair_exclusion_nlist.py
@@ -266,3 +266,28 @@ def test_vesin_neighbor_list_pair_excl_equals_seam() -> None:
         coord, atype, box=None, rcut=rcut, sel=sel, pair_excl=excl
     )
     np.testing.assert_array_equal(nlist_builtin.numpy(), nlist_manual.numpy())
+
+
+# ---------------------------------------------------------------------------
+# NvNeighborList.build: edges + pair_excl raises
+# ---------------------------------------------------------------------------
+
+
+def test_nv_nlist_edges_pair_excl_raises() -> None:
+    """NvNeighborList.build raises NotImplementedError for edges+pair_excl.
+
+    The guard fires before any CUDA search, so this test runs on CPU.
+    NvNeighborList requires CUDA to produce results, but the early-exit
+    raise is device-independent.  Skipped when torch is not installed.
+    """
+    torch = pytest.importorskip("torch")
+
+    from deepmd.dpmodel.utils.exclude_mask import PairExcludeMask
+    from deepmd.pt.utils.nv_nlist import NvNeighborList
+
+    coord = torch.zeros((1, 4, 3), dtype=torch.float64)
+    atype = torch.zeros((1, 4), dtype=torch.int64)
+    pe = PairExcludeMask(2, [(0, 1)])
+    nl = NvNeighborList()
+    with pytest.raises(NotImplementedError, match="return_mode='edges'"):
+        nl.build(coord, atype, None, 2.0, [4], return_mode="edges", pair_excl=pe)
diff --git a/source/tests/pt_expt/utils/test_vesin_graph_builder.py b/source/tests/pt_expt/utils/test_vesin_graph_builder.py
index ce9ac567f9..41cf0245d3 100644
--- a/source/tests/pt_expt/utils/test_vesin_graph_builder.py
+++ b/source/tests/pt_expt/utils/test_vesin_graph_builder.py
@@ -158,3 +158,15 @@ def test_vesin_pair_excl_oracle_set_equality(periodic):
     assert int(np.asarray(ng_vesin.edge_mask).sum()) < int(
         np.asarray(ng_plain.edge_mask).sum()
     )
+
+
+def test_vesin_nlist_edges_pair_excl_raises() -> None:
+    """VesinNeighborList.build with return_mode='edges' and pair_excl raises NotImplementedError."""
+    from deepmd.pt_expt.utils.vesin_neighbor_list import VesinNeighborList
+
+    coord = torch.zeros((1, 4, 3), dtype=torch.float64)
+    atype = torch.zeros((1, 4), dtype=torch.int64)
+    pe = PairExcludeMask(2, [(0, 1)])
+    nl = VesinNeighborList()
+    with pytest.raises(NotImplementedError, match="return_mode='edges'"):
+        nl.build(coord, atype, None, 2.0, [4], return_mode="edges", pair_excl=pe)

From 32e8d4e9595e46f406143dd1bb2a43cb36e091d2 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 02:11:41 +0800
Subject: [PATCH 31/38] feat(pt_expt): serialize pair_exclude_types into .pt2
 metadata + C++ cross-refs

Serialize the model-level pair_exclude_types into metadata.json so the C++
DeepPotPTExpt loader can rebuild the flat (ntypes+1)^2 keep table and re-apply
the same pair-exclusion transform at the ingestion seam. Add See Also
cross-references between the Python transforms (apply_pair_exclusion,
apply_pair_exclusion_nlist) and their forthcoming C++ twins.
---
 deepmd/dpmodel/utils/neighbor_graph/graph.py |  9 ++++++++
 deepmd/dpmodel/utils/nlist.py                |  8 +++++++
 deepmd/pt_expt/utils/serialization.py        | 23 ++++++++++++++++++++
 3 files changed, 40 insertions(+)

diff --git a/deepmd/dpmodel/utils/neighbor_graph/graph.py b/deepmd/dpmodel/utils/neighbor_graph/graph.py
index bef853682b..9586461312 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/graph.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/graph.py
@@ -210,6 +210,15 @@ def apply_pair_exclusion(
     NeighborGraph
         A ``dataclasses.replace`` copy (or the original ``graph`` on early
         exit) with the exclusion applied.
+
+    See Also
+    --------
+    C++ twin ``applyPairExclusion`` in ``source/api_cc/include/commonPT.h``
+        The inference-path mirror. Same argument order (edge_index, edge_mask,
+        atype, ...), same variable names (``type_ij``, ``keep``): it computes
+        ``type_ij = atype[dst]*(ntypes+1) + atype[src]`` and ANDs the flat
+        ``(ntypes+1)^2`` table lookup into ``edge_mask`` (mask-only mode; no
+        compact variant on the compiled path).
     """
     import dataclasses
 
diff --git a/deepmd/dpmodel/utils/nlist.py b/deepmd/dpmodel/utils/nlist.py
index 9d6a95cb48..d4330d21a9 100644
--- a/deepmd/dpmodel/utils/nlist.py
+++ b/deepmd/dpmodel/utils/nlist.py
@@ -90,6 +90,14 @@ def apply_pair_exclusion_nlist(
     This is the nlist-representation counterpart of
     :func:`deepmd.dpmodel.utils.neighbor_graph.apply_pair_exclusion`.
 
+    See Also
+    --------
+    C++ twin ``applyPairExclusionNlist`` in ``source/api_cc/include/commonPT.h``
+        The inference-path mirror. Same argument order (nlist, atype_ext, ...),
+        same variable names (``type_ij``, ``keep``): it computes ``type_ij``
+        from the center/neighbor types via the flat ``(ntypes+1)^2`` table and
+        replaces excluded entries with ``-1``.
+
     Parameters
     ----------
     nlist : Array
diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py
index 9317e87f4d..ef7a1d69b6 100644
--- a/deepmd/pt_expt/utils/serialization.py
+++ b/deepmd/pt_expt/utils/serialization.py
@@ -738,6 +738,29 @@ def _probe_has_message_passing(obj: object) -> bool | None:
     #   "graph" → NeighborGraph (atype, n_node, edge_index, edge_vec, edge_mask)
     # The C++ loader branches on this to build the matching inputs.
     meta["lower_input_kind"] = "graph" if lower_kind == "graph" else "nlist"
+
+    # Model-level pair-type exclusion (``pair_exclude_types``): a list of
+    # ``[ti, tj]`` type pairs whose interaction is dropped.  The compiled AOTI
+    # forward ALREADY applies this exclusion internally (the graph seam
+    # ``apply_pair_exclusion`` / dense seam ``apply_pair_exclusion_nlist`` is
+    # traced into the exported artifact), so this field is redundant for the
+    # compiled math.  It is serialized so that the C++ loaders
+    # (``DeepPotPTExpt::init``) can rebuild the flat ``(ntypes+1)^2`` keep table
+    # and re-apply the SAME transform (``applyPairExclusion`` /
+    # ``applyPairExclusionNlist`` in ``commonPT.h``) at the ingestion seam as an
+    # idempotent backstop, keeping the C++ ingestion path side-by-side
+    # reviewable with the Python transforms.  Descriptor-level ``exclude_types``
+    # needs NO metadata: it is fully inside the compiled graph.
+    pair_exclude_types: list[list[int]] = []
+    for obj in (
+        getattr(model, "atomic_model", None),
+        model,
+    ):
+        pet = getattr(obj, "pair_exclude_types", None)
+        if pet:
+            pair_exclude_types = [[int(ti), int(tj)] for (ti, tj) in pet]
+            break
+    meta["pair_exclude_types"] = pair_exclude_types
     return meta
 
 

From d4c6bcf540a6cbad0dff1494720432ab86f4a24d Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 02:17:03 +0800
Subject: [PATCH 32/38] feat(api_cc): C++ pair-exclusion twins at the pt_expt
 ingestion seam

Add buildPairExcludeTable / applyPairExclusion (graph) / applyPairExclusionNlist
(dense) in commonPT.h, structurally mirroring the Python transforms
(apply_pair_exclusion, apply_pair_exclusion_nlist) with the same argument order
and variable names (type_ij, keep). DeepPotPTExpt::init rebuilds the flat
(ntypes+1)^2 keep table from the pair_exclude_types metadata; the seam applies
it before every model call (graph and dense, LAMMPS and standalone) as an
idempotent backstop to the exclusion already compiled into the .pt2.
---
 source/api_cc/include/DeepPotPTExpt.h |   7 ++
 source/api_cc/include/commonPT.h      | 141 ++++++++++++++++++++++++++
 source/api_cc/src/DeepPotPTExpt.cc    |  46 +++++++--
 3 files changed, 188 insertions(+), 6 deletions(-)

diff --git a/source/api_cc/include/DeepPotPTExpt.h b/source/api_cc/include/DeepPotPTExpt.h
index ddaea35646..1c3400cc4b 100644
--- a/source/api_cc/include/DeepPotPTExpt.h
+++ b/source/api_cc/include/DeepPotPTExpt.h
@@ -331,6 +331,13 @@ class DeepPotPTExpt : public DeepPotBackend {
   // continue to work; GNN archives must be regenerated to opt into
   // the fail-fast guard against the silent-corruption bug.
   bool has_message_passing_ = false;
+  // Flat (ntypes+1)^2 model-level pair-type keep table, rebuilt in ``init`` from
+  // the ``pair_exclude_types`` metadata field (see
+  // ``deepmd::buildPairExcludeTable``).  Empty => no model-level exclusion.
+  // Applied at the C++ ingestion seam (``applyPairExclusion`` graph /
+  // ``applyPairExclusionNlist`` dense) as an idempotent backstop; the compiled
+  // .pt2 graph already applies the same transform internally.
+  std::vector<int> pair_exclude_table_;
   std::unique_ptr<deepmd::ptexpt::TempFile> with_comm_tempfile_;
   std::unique_ptr<torch::inductor::AOTIModelPackageLoader> with_comm_loader;
 
diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h
index 02c25aa047..c379e9acc9 100644
--- a/source/api_cc/include/commonPT.h
+++ b/source/api_cc/include/commonPT.h
@@ -7,8 +7,10 @@
 #include <algorithm>
 #include <cstdint>
 #include <map>
+#include <set>
 #include <string>
 #include <type_traits>
+#include <utility>
 #include <vector>
 
 #include "common.h"
@@ -462,6 +464,145 @@ inline GraphTensorPack buildGraphTensors(
   return pack;
 }
 
+/**
+ * @brief Build the flat ``(ntypes+1)^2`` pair-type keep table.
+ *
+ * Inference-path mirror of the Python ``PairExcludeMask`` constructor
+ * (``deepmd/dpmodel/utils/exclude_mask.py``).  The table is row-major over
+ * ``[tj][ti]`` (flat index ``tj * (ntypes+1) + ti``); an entry is ``0`` when the
+ * ordered pair ``(ti, tj)`` is excluded and ``1`` otherwise.  Both ``(ti, tj)``
+ * and ``(tj, ti)`` are inserted into the exclude set, so the table is
+ * symmetric.  Type ``ntypes`` is the reserved virtual-atom row/column.
+ *
+ * Returns an empty vector when ``exclude_types`` is empty, so callers can treat
+ * an empty table as "no exclusion" (identity) just like the Python
+ * ``pair_excl is None`` early-exit.
+ *
+ * @param ntypes Number of real atom types.
+ * @param exclude_types List of excluded ``(ti, tj)`` type pairs.
+ */
+inline std::vector<int> buildPairExcludeTable(
+    const int ntypes, const std::vector<std::pair<int, int>>& exclude_types) {
+  if (exclude_types.empty()) {
+    return {};
+  }
+  const int n1 = ntypes + 1;
+  std::set<std::pair<int, int>> excl;
+  for (const auto& tt : exclude_types) {
+    excl.insert({tt.first, tt.second});
+    excl.insert({tt.second, tt.first});
+  }
+  // type_mask[tj][ti] == 0 iff (ti, tj) is excluded (mirrors the Python
+  // list comprehension in PairExcludeMask.__init__, reshape(-1)).
+  std::vector<int> type_mask(static_cast<size_t>(n1) * n1, 1);
+  for (int tj = 0; tj < n1; ++tj) {
+    for (int ti = 0; ti < n1; ++ti) {
+      if (excl.count({ti, tj})) {
+        type_mask[static_cast<size_t>(tj) * n1 + ti] = 0;
+      }
+    }
+  }
+  return type_mask;
+}
+
+/**
+ * @brief Graph pair-type exclusion: AND the per-edge keep-mask into
+ *        ``edge_mask``.
+ *
+ * Inference-path twin of Python ``apply_pair_exclusion`` (mask-only mode) in
+ * ``deepmd/dpmodel/utils/neighbor_graph/graph.py`` +
+ * ``PairExcludeMask.build_edge_exclude_mask``.  Kept side-by-side reviewable:
+ * same argument order (edge_index, edge_mask, atype, ...) and same variable
+ * names (``type_ij``, ``keep``).
+ *
+ * The compiled ``.pt2`` graph already applies this exclusion internally (the
+ * seam transform is traced into the exported forward), so this call is an
+ * idempotent backstop at the C++ ingestion seam.
+ *
+ * @param edge_index (2, E) int64 ``[src, dst]``; src = neighbor, dst = center.
+ * @param edge_mask (E,) bool real-vs-padding mask (input is not modified).
+ * @param atype (N,) int64 flat node types (clamped >= 0).
+ * @param type_mask_table Flat ``(ntypes+1)^2`` table from
+ *   ``buildPairExcludeTable``.  Empty => identity (returns ``edge_mask``).
+ * @param ntypes Number of real atom types.
+ * @return New ``edge_mask`` with excluded edges cleared.
+ */
+inline torch::Tensor applyPairExclusion(const torch::Tensor& edge_index,
+                                        const torch::Tensor& edge_mask,
+                                        const torch::Tensor& atype,
+                                        const std::vector<int>& type_mask_table,
+                                        const int ntypes) {
+  if (type_mask_table.empty()) {
+    return edge_mask;
+  }
+  const auto device = edge_mask.device();
+  const auto src = edge_index.index({0});  // (E,) neighbour
+  const auto dst = edge_index.index({1});  // (E,) center
+  const auto src_t = atype.index_select(0, src);
+  const auto dst_t = atype.index_select(0, dst);
+  // type_ij = atype[dst] * (ntypes + 1) + atype[src]  (matches Python)
+  const auto type_ij = dst_t * (ntypes + 1) + src_t;
+  const auto table =
+      torch::from_blob(const_cast<int*>(type_mask_table.data()),
+                       {static_cast<std::int64_t>(type_mask_table.size())},
+                       torch::TensorOptions().dtype(torch::kInt32))
+          .clone()
+          .to(device);
+  const auto keep = table.index_select(0, type_ij).to(torch::kBool);
+  return torch::logical_and(edge_mask, keep);
+}
+
+/**
+ * @brief Dense-nlist pair-type exclusion: erase excluded neighbours to ``-1``.
+ *
+ * Inference-path twin of Python ``apply_pair_exclusion_nlist`` in
+ * ``deepmd/dpmodel/utils/nlist.py`` + ``PairExcludeMask.build_type_exclude_mask``.
+ * Same argument order (nlist, atype_ext, ...) and same variable names
+ * (``type_ij``, ``keep``).  Idempotent: erasing ``-1`` a second time is a no-op.
+ *
+ * @param nlist (nf, nloc, nnei) int64 neighbour list; ``-1`` == empty slot.
+ * @param atype_ext (nf, nall) int64 extended atom types.
+ * @param type_mask_table Flat ``(ntypes+1)^2`` table from
+ *   ``buildPairExcludeTable``.  Empty => identity (returns ``nlist``).
+ * @param ntypes Number of real atom types.
+ * @return New neighbour list with excluded entries set to ``-1``.
+ */
+inline torch::Tensor applyPairExclusionNlist(
+    const torch::Tensor& nlist,
+    const torch::Tensor& atype_ext,
+    const std::vector<int>& type_mask_table,
+    const int ntypes) {
+  if (type_mask_table.empty()) {
+    return nlist;
+  }
+  const auto device = nlist.device();
+  const std::int64_t nf = nlist.size(0);
+  const std::int64_t nloc = nlist.size(1);
+  const std::int64_t nnei = nlist.size(2);
+  const std::int64_t nall = atype_ext.size(1);
+  // center types: first nloc extended atoms.  type_i = atype * (ntypes + 1).
+  const auto type_i = atype_ext.slice(1, 0, nloc) * (ntypes + 1);  // (nf, nloc)
+  // append virtual atom of type ntypes; map -1 neighbours to it.
+  const auto ae = torch::cat(
+      {atype_ext, torch::full({nf, 1}, ntypes, atype_ext.options())}, 1);
+  const auto nlist_for_type =
+      torch::where(nlist == -1, torch::full_like(nlist, nall), nlist);
+  const auto type_j = torch::gather(
+      ae.unsqueeze(1).expand({nf, nloc, nall + 1}), 2, nlist_for_type);
+  // type_ij = center_type * (ntypes + 1) + type_j  (matches Python).  type_i
+  // already carries the (ntypes + 1) factor, so only type_j is added here.
+  const auto type_ij = type_i.unsqueeze(2) + type_j;  // (nf, nloc, nnei)
+  const auto table =
+      torch::from_blob(const_cast<int*>(type_mask_table.data()),
+                       {static_cast<std::int64_t>(type_mask_table.size())},
+                       torch::TensorOptions().dtype(torch::kInt32))
+          .clone()
+          .to(device);
+  const auto keep =
+      table.index_select(0, type_ij.reshape({-1})).reshape({nf, nloc, nnei});
+  return torch::where(keep == 1, nlist, torch::full_like(nlist, -1));
+}
+
 /**
  * @brief Remap NeighborGraph (graph-schema) public outputs onto the dense
  *        internal-key layout the rest of ``compute`` consumes.
diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc
index cff23388b6..ad09ae9365 100644
--- a/source/api_cc/src/DeepPotPTExpt.cc
+++ b/source/api_cc/src/DeepPotPTExpt.cc
@@ -200,6 +200,21 @@ void DeepPotPTExpt::init(const std::string& model,
   // scripts and carry the explicit value.
   has_message_passing_ = metadata.obj_val.count("has_message_passing") &&
                          metadata["has_message_passing"].as_bool();
+
+  // Model-level pair-type exclusion table.  ``pair_exclude_types`` is a list of
+  // [ti, tj] pairs; rebuild the flat (ntypes+1)^2 keep table exactly like the
+  // Python ``PairExcludeMask`` ctor so the ingestion seam can re-apply the same
+  // exclusion (idempotent backstop; the compiled graph already applies it).
+  {
+    std::vector<std::pair<int, int>> pair_exclude_types;
+    if (metadata.obj_val.count("pair_exclude_types")) {
+      for (const auto& v : metadata["pair_exclude_types"].as_array()) {
+        pair_exclude_types.emplace_back(v[0].as_int(), v[1].as_int());
+      }
+    }
+    pair_exclude_table_ = deepmd::buildPairExcludeTable(ntypes,
+                                                        pair_exclude_types);
+  }
   if (has_comm_artifact_) {
     try {
       // Extract the nested ``extra/forward_lower_with_comm.pt2`` into a
@@ -875,12 +890,22 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
           torch::full({1}, n_node_count, int_option).to(device);
       at::Tensor node_atype =
           atype_Tensor.slice(1, 0, n_node_count).reshape({n_node_count});
+      // Model-level pair exclusion at the ingestion seam (idempotent backstop;
+      // the compiled graph already applies the same transform internally).
+      const at::Tensor graph_edge_mask =
+          deepmd::applyPairExclusion(edge_tensors.edge_index,
+                                     edge_tensors.edge_mask, node_atype,
+                                     pair_exclude_table_, ntypes);
       flat_outputs =
           run_model_graph(node_atype, n_node_tensor, edge_tensors.edge_index,
-                          edge_tensors.edge_vec, edge_tensors.edge_mask,
-                          fparam_tensor, aparam_tensor, charge_spin_tensor);
+                          edge_tensors.edge_vec, graph_edge_mask, fparam_tensor,
+                          aparam_tensor, charge_spin_tensor);
     } else {
-      flat_outputs = run_model(coord_Tensor, atype_Tensor, firstneigh_tensor,
+      // Model-level pair exclusion at the dense ingestion seam (idempotent
+      // backstop; the compiled dense forward already applies the same erase).
+      const at::Tensor excl_nlist = deepmd::applyPairExclusionNlist(
+          firstneigh_tensor, atype_Tensor, pair_exclude_table_, ntypes);
+      flat_outputs = run_model(coord_Tensor, atype_Tensor, excl_nlist,
                                mapping_tensor, fparam_tensor, aparam_tensor,
                                charge_spin_tensor);
     }
@@ -1238,13 +1263,22 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
                         edge_tensors.edge_index_ext, edge_tensors.edge_mask,
                         fparam_tensor, aparam_tensor, charge_spin_tensor);
   } else if (lower_input_is_graph_) {
+    // Model-level pair exclusion at the ingestion seam (idempotent backstop;
+    // the compiled graph already applies the same transform internally).
+    const at::Tensor graph_edge_mask = deepmd::applyPairExclusion(
+        graph_tensors.edge_index, graph_tensors.edge_mask, graph_tensors.atype,
+        pair_exclude_table_, ntypes);
     flat_outputs = run_model_graph(
         graph_tensors.atype, graph_tensors.n_node, graph_tensors.edge_index,
-        graph_tensors.edge_vec, graph_tensors.edge_mask, fparam_tensor,
-        aparam_tensor, charge_spin_tensor);
+        graph_tensors.edge_vec, graph_edge_mask, fparam_tensor, aparam_tensor,
+        charge_spin_tensor);
   } else {
+    // Model-level pair exclusion at the dense ingestion seam (idempotent
+    // backstop; the compiled dense forward already applies the same erase).
+    const at::Tensor excl_nlist = deepmd::applyPairExclusionNlist(
+        nlist_tensor, atype_Tensor, pair_exclude_table_, ntypes);
     flat_outputs =
-        run_model(coord_Tensor, atype_Tensor, nlist_tensor, mapping_tensor,
+        run_model(coord_Tensor, atype_Tensor, excl_nlist, mapping_tensor,
                   fparam_tensor, aparam_tensor, charge_spin_tensor);
   }
 

From 500e2c15a50ca4ad46a4d2f974ba544794244b29 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 02:32:09 +0800
Subject: [PATCH 33/38] test(api_cc): gtest + gen script for C++ pair-exclusion
 seam (dpa1)

gen_dpa1_pairexcl.py exports graph-route and dense-route DPA1(attn_layer=0)
.pt2 models with model-level pair_exclude_types=[[0,1]] plus a no-exclusion
baseline; references come from Python DeepEval of each model.

test_deeppot_dpa1_pairexcl_ptexpt.cc validates both C++ ingestion routes
(applyPairExclusion / applyPairExclusionNlist) against the Python references at
1e-10 (fp64), cross-checks graph==dense, and proves the exclusion is active by
comparing against the empty-table baseline. Wired into test_cc_local.sh.
8/8 tests pass locally.
---
 .../test_deeppot_dpa1_pairexcl_ptexpt.cc      | 176 ++++++++++++++++
 source/install/test_cc_local.sh               |   3 +
 source/tests/infer/gen_dpa1_pairexcl.py       | 188 ++++++++++++++++++
 3 files changed, 367 insertions(+)
 create mode 100644 source/api_cc/tests/test_deeppot_dpa1_pairexcl_ptexpt.cc
 create mode 100644 source/tests/infer/gen_dpa1_pairexcl.py

diff --git a/source/api_cc/tests/test_deeppot_dpa1_pairexcl_ptexpt.cc b/source/api_cc/tests/test_deeppot_dpa1_pairexcl_ptexpt.cc
new file mode 100644
index 0000000000..18a3016b03
--- /dev/null
+++ b/source/api_cc/tests/test_deeppot_dpa1_pairexcl_ptexpt.cc
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+// Test the C++ model-level pair-exclusion ingestion seam of the pt_expt
+// backend (Task A3/A4).  Two DPA1(attn_layer=0) models with identical weights
+// and model-level pair_exclude_types=[[0,1]] are exported, one through each C++
+// ingestion route:
+//   - deeppot_dpa1_pairexcl_graph.pt2 -> applyPairExclusion (graph route)
+//   - deeppot_dpa1_pairexcl_nlist.pt2 -> applyPairExclusionNlist (dense route)
+// A no-exclusion baseline (deeppot_dpa1_pairexcl_none.pt2, empty exclude table)
+// exercises the identity/pre-change branch of both helpers.
+//
+// The compiled .pt2 forward ALREADY applies the exclusion internally (the seam
+// transform is traced into the exported artifact), so the C++ helpers are an
+// idempotent backstop; the reference values (.expected sidecars) come from the
+// Python DeepEval of the SAME .pt2, so a 1e-10 match validates the whole chain
+// (pair_exclude_types metadata round-trip + init table build + seam apply +
+// compiled math).  A separate assertion (excluded energy != baseline energy)
+// proves the exclusion is genuinely active and not silently dropped.
+#include <gtest/gtest.h>
+
+#include <cmath>
+#include <vector>
+
+#include "DeepPot.h"
+#include "DeepPotPTExpt.h"
+#include "expected_ref.h"
+#include "test_utils.h"
+
+namespace {
+constexpr const char* kGraphModel =
+    "../../tests/infer/deeppot_dpa1_pairexcl_graph.pt2";
+constexpr const char* kNlistModel =
+    "../../tests/infer/deeppot_dpa1_pairexcl_nlist.pt2";
+constexpr const char* kNoneModel =
+    "../../tests/infer/deeppot_dpa1_pairexcl_none.pt2";
+constexpr const char* kGraphRef =
+    "../../tests/infer/deeppot_dpa1_pairexcl_graph.expected";
+constexpr const char* kNlistRef =
+    "../../tests/infer/deeppot_dpa1_pairexcl_nlist.expected";
+}  // namespace
+
+template <class VALUETYPE>
+class TestInferDpa1PairExclPtExpt : public ::testing::Test {
+ protected:
+  std::vector<VALUETYPE> coord = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74,
+                                  00.25, 3.32, 1.68, 3.36,  3.00, 1.81,
+                                  3.51,  2.51, 2.60, 4.27,  3.22, 1.56};
+  std::vector<int> atype = {0, 1, 1, 0, 1, 1};
+  std::vector<VALUETYPE> box = {13., 0., 0., 0., 13., 0., 0., 0., 13.};
+
+  // Excluded models (one per ingestion route) + no-exclusion baseline.
+  static deepmd::DeepPot dp_graph;
+  static deepmd::DeepPot dp_nlist;
+  static deepmd::DeepPot dp_none;
+
+  static void SetUpTestSuite() {
+#if defined(BUILD_PYTORCH) && BUILD_PT_EXPT
+    dp_graph.init(kGraphModel);
+    dp_nlist.init(kNlistModel);
+    dp_none.init(kNoneModel);
+#endif
+  }
+
+  static void TearDownTestSuite() {
+    dp_graph = deepmd::DeepPot();
+    dp_nlist = deepmd::DeepPot();
+    dp_none = deepmd::DeepPot();
+  }
+
+  void SetUp() override {
+#if !defined(BUILD_PYTORCH) || !BUILD_PT_EXPT
+    GTEST_SKIP() << "Skip because PyTorch support is not enabled.";
+#endif
+  }
+
+  // Load per-atom reference from a .expected sidecar and reduce to totals.
+  void load_ref(const char* path,
+                double& tot_e,
+                std::vector<VALUETYPE>& per_f,
+                std::vector<VALUETYPE>& tot_v,
+                int& natoms) {
+    deepmd_test::ExpectedRef ref;
+    ref.load(path);
+    const auto per_e = ref.get<VALUETYPE>("pbc", "expected_e");
+    per_f = ref.get<VALUETYPE>("pbc", "expected_f");
+    const auto per_v = ref.get<VALUETYPE>("pbc", "expected_v");
+    natoms = per_e.size();
+    tot_e = 0.;
+    for (int ii = 0; ii < natoms; ++ii) {
+      tot_e += per_e[ii];
+    }
+    tot_v.assign(9, 0.);
+    for (int ii = 0; ii < natoms; ++ii) {
+      for (int dd = 0; dd < 9; ++dd) {
+        tot_v[dd] += per_v[ii * 9 + dd];
+      }
+    }
+  }
+
+  // Run one model through the standalone build-nlist path and check it against
+  // its Python DeepEval reference at EPSILON.
+  void check_against_ref(deepmd::DeepPot& dp, const char* ref_path) {
+    double tot_e;
+    std::vector<VALUETYPE> per_f, tot_v;
+    int natoms;
+    load_ref(ref_path, tot_e, per_f, tot_v, natoms);
+
+    double ener;
+    std::vector<VALUETYPE> force, virial;
+    dp.compute(ener, force, virial, coord, atype, box);
+
+    EXPECT_EQ(force.size(), static_cast<size_t>(natoms * 3));
+    EXPECT_EQ(virial.size(), 9u);
+    EXPECT_LT(fabs(ener - tot_e), EPSILON);
+    for (int ii = 0; ii < natoms * 3; ++ii) {
+      EXPECT_LT(fabs(force[ii] - per_f[ii]), EPSILON);
+    }
+    for (int ii = 0; ii < 9; ++ii) {
+      EXPECT_LT(fabs(virial[ii] - tot_v[ii]), EPSILON);
+    }
+  }
+};
+
+template <class VALUETYPE>
+deepmd::DeepPot TestInferDpa1PairExclPtExpt<VALUETYPE>::dp_graph;
+template <class VALUETYPE>
+deepmd::DeepPot TestInferDpa1PairExclPtExpt<VALUETYPE>::dp_nlist;
+template <class VALUETYPE>
+deepmd::DeepPot TestInferDpa1PairExclPtExpt<VALUETYPE>::dp_none;
+
+TYPED_TEST_SUITE(TestInferDpa1PairExclPtExpt, ValueTypes);
+
+// Graph route: applyPairExclusion at the ingestion seam + compiled exclusion.
+TYPED_TEST(TestInferDpa1PairExclPtExpt, graph_route_matches_python_ref) {
+  this->check_against_ref(this->dp_graph, kGraphRef);
+}
+
+// Dense route: applyPairExclusionNlist at the ingestion seam + compiled
+// exclusion.
+TYPED_TEST(TestInferDpa1PairExclPtExpt, nlist_route_matches_python_ref) {
+  this->check_against_ref(this->dp_nlist, kNlistRef);
+}
+
+// The two ingestion routes carry the SAME weights and exclusion, so at
+// non-binding sel they must agree to within EPSILON (fp64 ~1e-10).
+TYPED_TEST(TestInferDpa1PairExclPtExpt, graph_equals_nlist_route) {
+  using VALUETYPE = TypeParam;
+  double e_g, e_n;
+  std::vector<VALUETYPE> f_g, v_g, f_n, v_n;
+  this->dp_graph.compute(e_g, f_g, v_g, this->coord, this->atype, this->box);
+  this->dp_nlist.compute(e_n, f_n, v_n, this->coord, this->atype, this->box);
+  EXPECT_LT(fabs(e_g - e_n), EPSILON);
+  ASSERT_EQ(f_g.size(), f_n.size());
+  for (size_t ii = 0; ii < f_g.size(); ++ii) {
+    EXPECT_LT(fabs(f_g[ii] - f_n[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 9; ++ii) {
+    EXPECT_LT(fabs(v_g[ii] - v_n[ii]), EPSILON);
+  }
+}
+
+// The no-exclusion baseline exercises the EMPTY-table (identity) branch of the
+// C++ helpers; it must run cleanly and produce an energy that DIFFERS from the
+// excluded models (proving pair_exclude_types is genuinely active, not dropped).
+TYPED_TEST(TestInferDpa1PairExclPtExpt, exclusion_is_active_vs_baseline) {
+  using VALUETYPE = TypeParam;
+  double e_none, e_g, e_n;
+  std::vector<VALUETYPE> f, v;
+  this->dp_none.compute(e_none, f, v, this->coord, this->atype, this->box);
+  this->dp_graph.compute(e_g, f, v, this->coord, this->atype, this->box);
+  this->dp_nlist.compute(e_n, f, v, this->coord, this->atype, this->box);
+
+  EXPECT_TRUE(std::isfinite(e_none));
+  // Excluding all O-H pairs changes the energy well above the fp64 tolerance.
+  EXPECT_GT(fabs(e_g - e_none), 1e-6);
+  EXPECT_GT(fabs(e_n - e_none), 1e-6);
+}
diff --git a/source/install/test_cc_local.sh b/source/install/test_cc_local.sh
index 2247dcb46c..2b1c7fddb2 100755
--- a/source/install/test_cc_local.sh
+++ b/source/install/test_cc_local.sh
@@ -96,7 +96,10 @@ else:
 
 	env ${_GEN_ENV} python ${INFER_SCRIPT_PATH}/gen_dpa4.py &
 	PID9=$!
+	env ${_GEN_ENV} python ${INFER_SCRIPT_PATH}/gen_dpa1_pairexcl.py &
+	PID10=$!
 	wait $PID9
+	wait $PID10
 
 	env ${_GEN_ENV} python ${INFER_SCRIPT_PATH}/gen_spin.py &
 	PID7=$!
diff --git a/source/tests/infer/gen_dpa1_pairexcl.py b/source/tests/infer/gen_dpa1_pairexcl.py
new file mode 100644
index 0000000000..609ad91ddd
--- /dev/null
+++ b/source/tests/infer/gen_dpa1_pairexcl.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Generate DPA1 test models carrying model-level ``pair_exclude_types``.
+
+Produces two graph-eligible DPA1(attn_layer=0) models with identical weights,
+one exported through each C++ ingestion route, plus a no-exclusion baseline:
+
+  - deeppot_dpa1_pairexcl_graph.pt2  (lower_kind="graph", pair_exclude=[[0,1]])
+  - deeppot_dpa1_pairexcl_nlist.pt2  (lower_kind="nlist", pair_exclude=[[0,1]])
+  - deeppot_dpa1_pairexcl_none.pt2   (lower_kind="graph", NO exclusion)
+
+The pair models exercise the C++ pair-exclusion ingestion seam:
+``applyPairExclusion`` (graph route) / ``applyPairExclusionNlist`` (dense route)
+plus the ``pair_exclude_types`` metadata round-trip in ``DeepPotPTExpt::init``.
+The compiled ``.pt2`` forward ALREADY applies the exclusion internally (the seam
+transform is traced into the exported artifact), so the C++ seam is an
+idempotent backstop; the gtest validates C++ energy/force vs the Python
+``DeepEval`` reference at 1e-10 and, by comparing against the ``_none`` baseline,
+confirms the exclusion is actually active.
+
+Reference sidecar files (.expected) consumed by the C++ gtest are written from
+the Python ``DeepEval`` evaluation of each pair model (PBC + NoPBC sections).
+
+exclude_types = [[0, 1]] drops every O-H (cross-type) interaction while keeping
+O-O and H-H, so both energy and forces change measurably but stay non-degenerate
+(H-H pairs survive).
+
+This is a SEPARATE script from ``gen_dpa1.py`` so it can be run independently on
+boxes where the unrelated attn_layer=2 model in ``gen_dpa1.py`` trips a torch
+inductor codegen bug.
+"""
+
+import copy
+import os
+import sys
+
+import numpy as np
+
+# Ensure the source tree is on the path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+
+from gen_common import (
+    ensure_inductor_compiler,
+    load_custom_ops,
+    write_expected_ref,
+)
+
+
+def main():
+    from deepmd.dpmodel.model.model import (
+        get_model,
+    )
+
+    ensure_inductor_compiler()
+
+    base_dir = os.path.dirname(__file__)
+
+    # Graph-eligible DPA1 (attn_layer=0); same descriptor as gen_dpa1.py
+    # Section B so the two fixtures stay comparable.
+    base_config = {
+        "type_map": ["O", "H"],
+        "descriptor": {
+            "type": "se_atten",
+            "sel": 30,
+            "rcut_smth": 2.0,
+            "rcut": 6.0,
+            "neuron": [2, 4, 8],
+            "axis_neuron": 4,
+            "attn": 5,
+            "attn_layer": 0,
+            "attn_dotr": True,
+            "attn_mask": False,
+            "activation_function": "tanh",
+            "scaling_factor": 1.0,
+            "normalize": True,
+            "temperature": 1.0,
+            "type_one_side": True,
+            "seed": 1,
+        },
+        "fitting_net": {
+            "neuron": [5, 5, 5],
+            "resnet_dt": True,
+            "seed": 1,
+        },
+    }
+
+    from deepmd.pt_expt.utils.serialization import (
+        deserialize_to_file as pt_expt_deserialize_to_file,
+    )
+
+    load_custom_ops()
+
+    from deepmd.infer import (
+        DeepPot,
+    )
+
+    coord = np.array(
+        [
+            12.83, 2.56, 2.18, 12.09, 2.87, 2.74, 0.25, 3.32, 1.68,
+            3.36, 3.00, 1.81, 3.51, 2.51, 2.60, 4.27, 3.22, 1.56,
+        ],
+        dtype=np.float64,
+    )  # fmt: skip
+    atype = [0, 1, 1, 0, 1, 1]
+    box = np.array([13.0, 0.0, 0.0, 0.0, 13.0, 0.0, 0.0, 0.0, 13.0], dtype=np.float64)
+
+    def build_data(exclude_types):
+        cfg = copy.deepcopy(base_config)
+        if exclude_types:
+            cfg["pair_exclude_types"] = exclude_types
+        model = get_model(copy.deepcopy(cfg))
+        return {
+            "model": copy.deepcopy(model.serialize()),
+            "model_def_script": cfg,
+            "backend": "dpmodel",
+            "software": "deepmd-kit",
+            "version": "3.0.0",
+        }
+
+    # ---- No-exclusion baseline (graph route) ----
+    none_pt2 = os.path.join(base_dir, "deeppot_dpa1_pairexcl_none.pt2")
+    print(f"Exporting no-exclusion baseline to {none_pt2} ...")  # noqa: T201
+    pt_expt_deserialize_to_file(
+        none_pt2, build_data(None), do_atomic_virial=True, lower_kind="graph"
+    )
+    dp_none = DeepPot(none_pt2)
+    e_none = dp_none.eval(coord, box, atype, atomic=False)[0][0, 0]
+    print(f"  baseline PBC energy: {e_none:.18e}")  # noqa: T201
+
+    # ---- Pair-exclusion models (graph + dense routes) ----
+    exclude_types = [[0, 1]]
+    data_e = build_data(exclude_types)
+    for lower_kind, tag in (("graph", "graph"), ("nlist", "nlist")):
+        pt2_path = os.path.join(base_dir, f"deeppot_dpa1_pairexcl_{tag}.pt2")
+        print(  # noqa: T201
+            f"Exporting to {pt2_path} (lower_kind='{lower_kind}', "
+            f"pair_exclude_types={exclude_types}) ..."
+        )
+        pt_expt_deserialize_to_file(
+            pt2_path,
+            copy.deepcopy(data_e),
+            do_atomic_virial=True,
+            lower_kind=lower_kind,
+        )
+        dp_e = DeepPot(pt2_path)
+        e_e1, f_e1, v_e1, ae_e1, av_e1 = dp_e.eval(coord, box, atype, atomic=True)
+        e_enp, f_enp, v_enp, ae_enp, av_enp = dp_e.eval(coord, None, atype, atomic=True)
+
+        # Confirm the exclusion is ACTIVE: energy must differ from the
+        # no-exclusion baseline (identical weights minus pair_exclude_types).
+        e_diff = float(abs(e_e1[0, 0] - e_none))
+        print(f"  {tag}: |E(excl) - E(no-excl)| = {e_diff:.3e}")  # noqa: T201
+        if e_diff < 1e-6:
+            raise RuntimeError(
+                f"BLOCKED: pair_exclude_types had no effect on the {tag} model "
+                f"(energy delta {e_diff:.2e} < 1e-6); exclusion may be dropped."
+            )
+        f_max = float(np.max(np.abs(f_e1)))
+        if f_max < 1e-10:
+            raise RuntimeError(
+                f"Pair-exclude {tag} forces are degenerate (max={f_max:.2e})."
+            )
+
+        ref_path = os.path.join(base_dir, f"deeppot_dpa1_pairexcl_{tag}.expected")
+        write_expected_ref(
+            ref_path,
+            sections={
+                "pbc": {
+                    "expected_e": ae_e1[0, :, 0],
+                    "expected_f": f_e1[0],
+                    "expected_v": av_e1[0],
+                },
+                "nopbc": {
+                    "expected_e": ae_enp[0, :, 0],
+                    "expected_f": f_enp[0],
+                    "expected_v": av_enp[0],
+                },
+            },
+            source_script="source/tests/infer/gen_dpa1_pairexcl.py",
+        )
+        print(f"  Wrote {ref_path}")  # noqa: T201
+
+    print("\nAll pair-exclude models generated.")  # noqa: T201
+    print("Done!")  # noqa: T201
+
+
+if __name__ == "__main__":
+    main()

From 5adae7e1218b501a2811cade6c1535a4956bd944 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 03:02:14 +0800
Subject: [PATCH 34/38] docs: exclude_types is graph-native; fix stale comments
 in _call_dense and forward_common_atomic_graph

---
 deepmd/dpmodel/atomic_model/base_atomic_model.py | 2 +-
 deepmd/dpmodel/descriptor/dpa1.py                | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py
index cbe0bdf5d4..240d4d6720 100644
--- a/deepmd/dpmodel/atomic_model/base_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py
@@ -333,7 +333,7 @@ def forward_common_atomic_graph(
         ``self.pair_excl is not None``, an edge-keep mask is ANDed into
         ``graph.edge_mask`` before the descriptor forward, so excluded type-pairs
         contribute zero to the segment_sum. Descriptor-level ``exclude_types`` is
-        gated by ``uses_graph_lower()==False``.
+        handled inside the descriptor's ``call_graph`` (graph-native).
 
         Parameters
         ----------
diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index fbd71d3f7c..b33a512297 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -684,8 +684,8 @@ def _call_dense(
         atype_ext: Array,
         nlist: Array,
     ) -> Array:
-        """Legacy dense descriptor body (the ineligible ``call`` path: attention,
-        strip tebd, exclude_types, or the no-mapping ghost case).
+        """Legacy dense descriptor body (the ineligible ``call`` path:
+        strip tebd or the no-mapping ghost case).
 
         Parameters
         ----------

From a0e681a3d1a669cf121af9bc80a6c6a00d428e35 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 03:39:55 +0800
Subject: [PATCH 35/38] fix(spin): force legacy dense routing in
 SpinModel.call_common

The SpinModel backbone (dpa1 attn_layer=0) was being routed to the
carry-all graph path by the default-flip (decision #17) introduced in
the graph-pair-exclude branch.  Virtual/placeholder types injected by
get_spin_model double the atom density, making the system sel-binding;
the carry-all graph keeps neighbors the capped dense nlist discards,
so SpinModel.call_common diverged from call_common_lower (the dense
lower used by pt_expt eager inference) by ~2e-6.

Fix: pass neighbor_graph_method='legacy' when SpinModel.call_common
invokes the backbone, forcing the dense-nlist path until spin-graph
support is explicitly implemented.

The graph .pt2 export path already has a fail-fast guard for spin
(serialization.py:963).

Adds a regression test pinning that:
- SpinModel.call_common total energy equals backbone(legacy) on the
  same spin-doubled inputs (exact bit-identity).
- The backbone in graph mode gives a DIFFERENT energy at this density,
  confirming the fixture exercises the diverging regime.
---
 deepmd/dpmodel/model/spin_model.py            |   5 +
 .../dpmodel/test_spin_model_legacy_routing.py | 154 ++++++++++++++++++
 2 files changed, 159 insertions(+)
 create mode 100644 source/tests/common/dpmodel/test_spin_model_legacy_routing.py

diff --git a/deepmd/dpmodel/model/spin_model.py b/deepmd/dpmodel/model/spin_model.py
index 373294bece..6a30434a58 100644
--- a/deepmd/dpmodel/model/spin_model.py
+++ b/deepmd/dpmodel/model/spin_model.py
@@ -628,6 +628,11 @@ def call_common(
             charge_spin=charge_spin,
             do_atomic_virial=do_atomic_virial,
             coord_corr_for_virial=coord_corr_for_virial,
+            # Spin graph support is not yet implemented; the carry-all graph
+            # route diverges on sel-binding spin systems (virtual atoms double
+            # the density).  Force the legacy dense-nlist path until spin-graph
+            # support lands.
+            neighbor_graph_method="legacy",
         )
         model_output_type = self.backbone_model.model_output_type()
         if "mask" in model_output_type:
diff --git a/source/tests/common/dpmodel/test_spin_model_legacy_routing.py b/source/tests/common/dpmodel/test_spin_model_legacy_routing.py
new file mode 100644
index 0000000000..c63efc6f64
--- /dev/null
+++ b/source/tests/common/dpmodel/test_spin_model_legacy_routing.py
@@ -0,0 +1,154 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Regression test: SpinModel backbone must stay on the legacy dense-nlist path.
+
+A spin system injects virtual/placeholder atoms alongside the real ones,
+doubling the atom density.  Before the fix (commit 6c2b007c9), the dpmodel
+``call_common`` auto-flip (decision #17) moved graph-eligible mixed_types
+backbones to the carry-all graph route.  When a spin model's backbone
+(dpa1 attn_layer=0) crossed to the graph route, its ``call_common`` diverged
+from the dense lower interface (``call_common_lower``) used by pt_expt eager
+inference -- the carry-all graph keeps neighbors the capped dense nlist
+discards.
+
+Fix: ``SpinModel.call_common`` forces ``neighbor_graph_method="legacy"`` on
+the backbone call so the spin backbone always runs dense regardless of the
+default flip.
+
+This test pins that contract:
+
+1. ``SpinModel.call_common`` energy == backbone energy computed with explicit
+   ``neighbor_graph_method="legacy"`` on the spin-doubled coordinate inputs.
+2. The backbone in graph mode (``neighbor_graph_method="ase"``) on the same
+   doubled inputs gives a DIFFERENT energy, confirming the fixture is
+   sel-binding (i.e., the virtual-atom density really does trigger divergence).
+"""
+
+import numpy as np
+import pytest
+
+from deepmd.dpmodel.model.model import (
+    get_spin_model,
+)
+
+
+def _spin_dpa1_config() -> dict:
+    """Minimal dpa1-backed spin model config (sel-binding on doubled inputs)."""
+    return {
+        "type": "standard",
+        "type_map": ["Fe", "H"],
+        "spin": {
+            "use_spin": [True, False],
+            "virtual_scale": [0.3],
+        },
+        "descriptor": {
+            "type": "dpa1",
+            "rcut": 4.0,
+            "rcut_smth": 0.5,
+            # Small sel: with 8 real + 8 virtual = 16 atoms in a 6 Å box,
+            # sel=8 is well below the average neighbor count → sel-binding.
+            "sel": 8,
+            "ntypes": 4,  # expanded to 4 by get_spin_model (2 real + 2 virtual)
+            "attn_layer": 0,
+            "axis_neuron": 2,
+            "neuron": [4, 8],
+            "seed": 0,
+        },
+        "fitting_net": {
+            "type": "ener",
+            "neuron": [4, 4],
+            "seed": 0,
+        },
+    }
+
+
+def _make_test_frame(rng: np.random.Generator):
+    """Return (coord, atype, box) for a small PBC cell with 8 atoms."""
+    natoms = 8  # 4 Fe + 4 H
+    coord = rng.random((1, natoms, 3)) * 4.0  # 1 frame
+    atype = np.array([[0, 0, 0, 0, 1, 1, 1, 1]])
+    box = np.eye(3).reshape(1, 9) * 6.0
+    return coord, atype, box
+
+
+def test_spin_model_backbone_routes_legacy() -> None:
+    """SpinModel.call_common energy must equal backbone with explicit legacy.
+
+    Procedure:
+    - Build the SpinModel and obtain the doubled coords/atypes via
+      ``process_spin_input`` (the same transform used internally).
+    - Call the backbone directly with ``neighbor_graph_method="legacy"`` on
+      those doubled inputs to get the expected dense-path energy.
+    - Call ``SpinModel.call_common`` and extract its energy.
+    - Assert the two energies are exactly equal.
+    - Additionally assert the backbone in ``neighbor_graph_method="ase"`` mode
+      on the same doubled inputs gives a DIFFERENT energy (sel-binding guard).
+    """
+    pytest.importorskip("ase")  # ase builder needed for divergence check
+
+    rng = np.random.default_rng(42)
+    coord, atype, box = _make_test_frame(rng)
+    spin = np.zeros_like(coord)
+    spin[:, :4, 2] = 1.0  # spin-z on Fe atoms only
+
+    model = get_spin_model(_spin_dpa1_config())
+    backbone = model.backbone_model
+
+    # --- Get doubled inputs via the model's own transform ---
+    coord_doubled, atype_doubled, _corr = model.process_spin_input(coord, atype, spin)
+
+    # --- Backbone with explicit legacy routing ---
+    legacy_ret = backbone.call_common(
+        coord_doubled, atype_doubled, box, neighbor_graph_method="legacy"
+    )
+    legacy_energy = np.array(legacy_ret["energy"])
+
+    # --- SpinModel.call_common (must route legacy internally) ---
+    spin_ret = model.call_common(coord, atype, spin, box)
+    spin_energy = np.array(spin_ret["energy"])
+
+    # ``energy`` here is per-atom energy; the backbone returns all 2*nloc
+    # atoms while the spin model truncates to the first nloc real atoms.
+    # Compare the total energy (sum over all atoms) which should be equal
+    # because the virtual-atom energies are zeroed by the exclusion mask.
+    np.testing.assert_allclose(
+        float(spin_energy.sum()),
+        float(legacy_energy.sum()),
+        rtol=0,
+        atol=0,
+        err_msg=(
+            "SpinModel.call_common total energy does not match backbone(legacy) "
+            "on doubled inputs; the spin model may be routing through the "
+            "carry-all graph instead of the legacy dense path."
+        ),
+    )
+
+    # --- Sel-binding guard: backbone graph must DIFFER from backbone legacy ---
+    graph_ret = backbone.call_common(
+        coord_doubled, atype_doubled, box, neighbor_graph_method="ase"
+    )
+    graph_energy = np.array(graph_ret["energy"])
+
+    assert not np.allclose(legacy_energy.sum(), graph_energy.sum(), rtol=1e-6), (
+        "Backbone legacy and graph give the same energy on the doubled spin "
+        "system — sel is not binding with these inputs; the regression fixture "
+        "is too weak.  Reduce sel or increase atom density."
+    )
+
+
+def test_spin_model_call_common_deterministic() -> None:
+    """SpinModel.call_common is deterministic (no stochastic routing)."""
+    rng = np.random.default_rng(7)
+    coord, atype, box = _make_test_frame(rng)
+    spin = np.zeros_like(coord)
+    spin[:, :4, 2] = 1.0
+
+    model = get_spin_model(_spin_dpa1_config())
+
+    ret1 = model.call_common(coord, atype, spin, box)
+    ret2 = model.call_common(coord, atype, spin, box)
+
+    np.testing.assert_array_equal(
+        float(np.array(ret1["energy"]).sum()),
+        float(np.array(ret2["energy"]).sum()),
+        err_msg="SpinModel.call_common is non-deterministic",
+    )

From 6245bcad03c9d83767a2e6680c8a792e422830d2 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 03:54:29 +0800
Subject: [PATCH 36/38] fix(spin): explicit graph-lower opt-out on backbone
 descriptor

The pair-exclude branch removed exclude_types from
DescrptDPA1.uses_graph_lower(), which previously kept spin backbones
(they inject exclude_types) on the dense path. As a side effect the
descriptor-level dispatch routed spin .pt2/.pte export through the
graph kernel, whose scatter/atomic_add tripped a torch-inductor CPU
codegen assertion (23 errors in test_deep_eval_spin.py).

Add an explicit disable_graph_lower() knob on DescrptDPA1 and set it
structurally in SpinModel.__init__ (covers get_spin_model and both
dpmodel/pt_expt deserialize paths, so it survives serialize round
trips). The flag is not serialized; re-derived at construction.
The neighbor_graph_method=legacy kwarg in call_common is kept as
belt-and-braces.
---
 deepmd/dpmodel/descriptor/dpa1.py             | 19 ++++++++
 deepmd/dpmodel/model/spin_model.py            | 15 ++++++-
 .../dpmodel/test_spin_model_legacy_routing.py | 43 +++++++++++++++++--
 3 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index b33a512297..3a838f4df9 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -355,6 +355,9 @@ def __init__(
         self.tebd_compress = False
         self.geo_compress = False
         self.compress = False
+        # When set, force the legacy dense lower even if the config would
+        # otherwise be graph-lower eligible (see ``disable_graph_lower``).
+        self._graph_lower_disabled = False
 
     def get_rcut(self) -> float:
         """Returns the cut-off radius."""
@@ -442,8 +445,24 @@ def uses_graph_lower(self) -> bool:
         differs from the dense lower by up to ~1e-4 (see the Notes of
         :meth:`call_graph`).
         """
+        if self._graph_lower_disabled:
+            return False
         return self.se_atten.tebd_input_mode == "concat"
 
+    def disable_graph_lower(self) -> None:
+        """Force the legacy dense lower for this descriptor.
+
+        This is an explicit opt-out knob used by contexts where the
+        graph-native lower is unsupported or undesirable (e.g. spin models,
+        whose carry-all routing diverges on sel-binding spin systems and
+        whose ``.pt2``/``.pte`` export trips a torch-inductor scatter/
+        atomic_add CPU codegen assertion).  After calling this,
+        :meth:`uses_graph_lower` returns ``False`` regardless of the
+        descriptor configuration.  The flag is not serialized; it is
+        re-derived structurally at spin-model construction/deserialization.
+        """
+        self._graph_lower_disabled = True
+
     def share_params(
         self, base_class: "DescrptDPA1", shared_level: int, resume: bool = False
     ) -> NoReturn:
diff --git a/deepmd/dpmodel/model/spin_model.py b/deepmd/dpmodel/model/spin_model.py
index 6a30434a58..fa3863320c 100644
--- a/deepmd/dpmodel/model/spin_model.py
+++ b/deepmd/dpmodel/model/spin_model.py
@@ -63,6 +63,16 @@ def __init__(
         super().__init__()
         self.backbone_model = backbone_model
         self.spin = spin
+        # Spin graph-lower unsupported: carry-all routing diverges on
+        # sel-binding spin systems and spin export trips inductor scatter
+        # codegen. Re-derived structurally here so it survives both
+        # construction and serialize/deserialize round trips (the flag is
+        # not part of the serialized schema).
+        dp_atomic_model = self.backbone_model.get_dp_atomic_model()
+        if dp_atomic_model is not None:
+            descriptor = getattr(dp_atomic_model, "descriptor", None)
+            if descriptor is not None and hasattr(descriptor, "disable_graph_lower"):
+                descriptor.disable_graph_lower()
         self.ntypes_real = self.spin.ntypes_real
         self.virtual_scale_mask = self.spin.get_virtual_scale_mask()
         self.spin_mask = self.spin.get_spin_mask()
@@ -630,8 +640,9 @@ def call_common(
             coord_corr_for_virial=coord_corr_for_virial,
             # Spin graph support is not yet implemented; the carry-all graph
             # route diverges on sel-binding spin systems (virtual atoms double
-            # the density).  Force the legacy dense-nlist path until spin-graph
-            # support lands.
+            # the density).  Belt-and-braces: the backbone descriptor already
+            # has graph-lower disabled in ``__init__``, but force the legacy
+            # dense-nlist path here too until spin-graph support lands.
             neighbor_graph_method="legacy",
         )
         model_output_type = self.backbone_model.model_output_type()
diff --git a/source/tests/common/dpmodel/test_spin_model_legacy_routing.py b/source/tests/common/dpmodel/test_spin_model_legacy_routing.py
index c63efc6f64..1a309a1ca4 100644
--- a/source/tests/common/dpmodel/test_spin_model_legacy_routing.py
+++ b/source/tests/common/dpmodel/test_spin_model_legacy_routing.py
@@ -96,6 +96,12 @@ def test_spin_model_backbone_routes_legacy() -> None:
     # --- Get doubled inputs via the model's own transform ---
     coord_doubled, atype_doubled, _corr = model.process_spin_input(coord, atype, spin)
 
+    # The backbone descriptor has graph-lower disabled by the spin-model
+    # opt-out knob; check that first.  It is re-enabled only temporarily for the
+    # sel-binding divergence probe below; the legacy/spin comparison stays dense.
+    backbone_descriptor = backbone.get_dp_atomic_model().descriptor
+    assert backbone_descriptor._graph_lower_disabled is True
+
     # --- Backbone with explicit legacy routing ---
     legacy_ret = backbone.call_common(
         coord_doubled, atype_doubled, box, neighbor_graph_method="legacy"
@@ -123,9 +129,14 @@ def test_spin_model_backbone_routes_legacy() -> None:
     )
 
     # --- Sel-binding guard: backbone graph must DIFFER from backbone legacy ---
-    graph_ret = backbone.call_common(
-        coord_doubled, atype_doubled, box, neighbor_graph_method="ase"
-    )
+    # Re-enable graph lower on the backbone descriptor only for this probe.
+    backbone_descriptor._graph_lower_disabled = False
+    try:
+        graph_ret = backbone.call_common(
+            coord_doubled, atype_doubled, box, neighbor_graph_method="ase"
+        )
+    finally:
+        backbone_descriptor._graph_lower_disabled = True
     graph_energy = np.array(graph_ret["energy"])
 
     assert not np.allclose(legacy_energy.sum(), graph_energy.sum(), rtol=1e-6), (
@@ -135,6 +146,32 @@ def test_spin_model_backbone_routes_legacy() -> None:
     )
 
 
+def test_spin_backbone_descriptor_graph_lower_disabled() -> None:
+    """The spin backbone descriptor must have graph-lower disabled.
+
+    The explicit ``disable_graph_lower`` opt-out knob is set structurally at
+    spin-model construction and must survive a serialize -> deserialize round
+    trip (the flag is NOT part of the serialized schema, so it must be
+    re-derived on deserialization).
+    """
+    model = get_spin_model(_spin_dpa1_config())
+    descriptor = model.backbone_model.get_dp_atomic_model().descriptor
+    # dpa1 with attn_layer=0 concat tebd would otherwise be graph-eligible.
+    assert descriptor._graph_lower_disabled is True
+    assert descriptor.uses_graph_lower() is False
+
+    # --- survive serialize -> deserialize ---
+    data = model.serialize()
+    from deepmd.dpmodel.model.spin_model import (
+        SpinModel,
+    )
+
+    model2 = SpinModel.deserialize(data)
+    descriptor2 = model2.backbone_model.get_dp_atomic_model().descriptor
+    assert descriptor2._graph_lower_disabled is True
+    assert descriptor2.uses_graph_lower() is False
+
+
 def test_spin_model_call_common_deterministic() -> None:
     """SpinModel.call_common is deterministic (no stochastic routing)."""
     rng = np.random.default_rng(7)

From 1b5c75d94f855f85aba886720c1ba77e42fce2f0 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 5 Jul 2026 00:58:34 +0000
Subject: [PATCH 37/38] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deepmd/dpmodel/utils/default_neighbor_list.py   |  4 +++-
 deepmd/dpmodel/utils/neighbor_list.py           |  4 +++-
 deepmd/dpmodel/utils/nlist.py                   |  4 +++-
 deepmd/pt/utils/nv_nlist.py                     |  4 +++-
 source/api_cc/include/DeepPotPTExpt.h           |  4 ++--
 source/api_cc/include/commonPT.h                | 14 ++++++++------
 source/api_cc/src/DeepPotPTExpt.cc              | 17 ++++++++---------
 .../tests/test_deeppot_dpa1_pairexcl_ptexpt.cc  |  3 ++-
 .../common/dpmodel/test_apply_pair_exclusion.py |  4 +++-
 .../dpmodel/test_apply_pair_exclusion_nlist.py  | 17 ++++++++++++-----
 .../common/dpmodel/test_graph_atomic_parity.py  |  6 +++---
 .../pt_expt/utils/test_vesin_graph_builder.py   |  4 +++-
 12 files changed, 53 insertions(+), 32 deletions(-)

diff --git a/deepmd/dpmodel/utils/default_neighbor_list.py b/deepmd/dpmodel/utils/default_neighbor_list.py
index c759e2f21d..d730b43669 100644
--- a/deepmd/dpmodel/utils/default_neighbor_list.py
+++ b/deepmd/dpmodel/utils/default_neighbor_list.py
@@ -26,7 +26,9 @@
 )
 
 if TYPE_CHECKING:
-    from deepmd.dpmodel.utils.exclude_mask import PairExcludeMask
+    from deepmd.dpmodel.utils.exclude_mask import (
+        PairExcludeMask,
+    )
 
 
 class DefaultNeighborList(NeighborList):
diff --git a/deepmd/dpmodel/utils/neighbor_list.py b/deepmd/dpmodel/utils/neighbor_list.py
index 947e8d2d33..c9b6f5923b 100644
--- a/deepmd/dpmodel/utils/neighbor_list.py
+++ b/deepmd/dpmodel/utils/neighbor_list.py
@@ -23,7 +23,9 @@
 )
 
 if TYPE_CHECKING:
-    from deepmd.dpmodel.utils.exclude_mask import PairExcludeMask
+    from deepmd.dpmodel.utils.exclude_mask import (
+        PairExcludeMask,
+    )
 
 
 @dataclass
diff --git a/deepmd/dpmodel/utils/nlist.py b/deepmd/dpmodel/utils/nlist.py
index d4330d21a9..3ee852e40f 100644
--- a/deepmd/dpmodel/utils/nlist.py
+++ b/deepmd/dpmodel/utils/nlist.py
@@ -19,7 +19,9 @@
 )
 
 if TYPE_CHECKING:
-    from deepmd.dpmodel.utils.exclude_mask import PairExcludeMask
+    from deepmd.dpmodel.utils.exclude_mask import (
+        PairExcludeMask,
+    )
 
 
 def _is_ndtensorflow_namespace(xp: Any) -> bool:
diff --git a/deepmd/pt/utils/nv_nlist.py b/deepmd/pt/utils/nv_nlist.py
index c6cb7c4ef8..df5f3dc119 100644
--- a/deepmd/pt/utils/nv_nlist.py
+++ b/deepmd/pt/utils/nv_nlist.py
@@ -51,7 +51,9 @@
         Iterator,
     )
 
-    from deepmd.dpmodel.utils.exclude_mask import PairExcludeMask
+    from deepmd.dpmodel.utils.exclude_mask import (
+        PairExcludeMask,
+    )
 
 
 @contextlib.contextmanager
diff --git a/source/api_cc/include/DeepPotPTExpt.h b/source/api_cc/include/DeepPotPTExpt.h
index 1c3400cc4b..6114dca340 100644
--- a/source/api_cc/include/DeepPotPTExpt.h
+++ b/source/api_cc/include/DeepPotPTExpt.h
@@ -331,8 +331,8 @@ class DeepPotPTExpt : public DeepPotBackend {
   // continue to work; GNN archives must be regenerated to opt into
   // the fail-fast guard against the silent-corruption bug.
   bool has_message_passing_ = false;
-  // Flat (ntypes+1)^2 model-level pair-type keep table, rebuilt in ``init`` from
-  // the ``pair_exclude_types`` metadata field (see
+  // Flat (ntypes+1)^2 model-level pair-type keep table, rebuilt in ``init``
+  // from the ``pair_exclude_types`` metadata field (see
   // ``deepmd::buildPairExcludeTable``).  Empty => no model-level exclusion.
   // Applied at the C++ ingestion seam (``applyPairExclusion`` graph /
   // ``applyPairExclusionNlist`` dense) as an idempotent backstop; the compiled
diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h
index c379e9acc9..af0f37df20 100644
--- a/source/api_cc/include/commonPT.h
+++ b/source/api_cc/include/commonPT.h
@@ -469,9 +469,9 @@ inline GraphTensorPack buildGraphTensors(
  *
  * Inference-path mirror of the Python ``PairExcludeMask`` constructor
  * (``deepmd/dpmodel/utils/exclude_mask.py``).  The table is row-major over
- * ``[tj][ti]`` (flat index ``tj * (ntypes+1) + ti``); an entry is ``0`` when the
- * ordered pair ``(ti, tj)`` is excluded and ``1`` otherwise.  Both ``(ti, tj)``
- * and ``(tj, ti)`` are inserted into the exclude set, so the table is
+ * ``[tj][ti]`` (flat index ``tj * (ntypes+1) + ti``); an entry is ``0`` when
+ * the ordered pair ``(ti, tj)`` is excluded and ``1`` otherwise.  Both ``(ti,
+ * tj)`` and ``(tj, ti)`` are inserted into the exclude set, so the table is
  * symmetric.  Type ``ntypes`` is the reserved virtual-atom row/column.
  *
  * Returns an empty vector when ``exclude_types`` is empty, so callers can treat
@@ -556,9 +556,11 @@ inline torch::Tensor applyPairExclusion(const torch::Tensor& edge_index,
  * @brief Dense-nlist pair-type exclusion: erase excluded neighbours to ``-1``.
  *
  * Inference-path twin of Python ``apply_pair_exclusion_nlist`` in
- * ``deepmd/dpmodel/utils/nlist.py`` + ``PairExcludeMask.build_type_exclude_mask``.
- * Same argument order (nlist, atype_ext, ...) and same variable names
- * (``type_ij``, ``keep``).  Idempotent: erasing ``-1`` a second time is a no-op.
+ * ``deepmd/dpmodel/utils/nlist.py`` +
+ * ``PairExcludeMask.build_type_exclude_mask``. Same argument order (nlist,
+ * atype_ext, ...) and same variable names
+ * (``type_ij``, ``keep``).  Idempotent: erasing ``-1`` a second time is a
+ * no-op.
  *
  * @param nlist (nf, nloc, nnei) int64 neighbour list; ``-1`` == empty slot.
  * @param atype_ext (nf, nall) int64 extended atom types.
diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc
index ad09ae9365..1185dfba80 100644
--- a/source/api_cc/src/DeepPotPTExpt.cc
+++ b/source/api_cc/src/DeepPotPTExpt.cc
@@ -212,8 +212,8 @@ void DeepPotPTExpt::init(const std::string& model,
         pair_exclude_types.emplace_back(v[0].as_int(), v[1].as_int());
       }
     }
-    pair_exclude_table_ = deepmd::buildPairExcludeTable(ntypes,
-                                                        pair_exclude_types);
+    pair_exclude_table_ =
+        deepmd::buildPairExcludeTable(ntypes, pair_exclude_types);
   }
   if (has_comm_artifact_) {
     try {
@@ -892,10 +892,9 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
           atype_Tensor.slice(1, 0, n_node_count).reshape({n_node_count});
       // Model-level pair exclusion at the ingestion seam (idempotent backstop;
       // the compiled graph already applies the same transform internally).
-      const at::Tensor graph_edge_mask =
-          deepmd::applyPairExclusion(edge_tensors.edge_index,
-                                     edge_tensors.edge_mask, node_atype,
-                                     pair_exclude_table_, ntypes);
+      const at::Tensor graph_edge_mask = deepmd::applyPairExclusion(
+          edge_tensors.edge_index, edge_tensors.edge_mask, node_atype,
+          pair_exclude_table_, ntypes);
       flat_outputs =
           run_model_graph(node_atype, n_node_tensor, edge_tensors.edge_index,
                           edge_tensors.edge_vec, graph_edge_mask, fparam_tensor,
@@ -905,9 +904,9 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
       // backstop; the compiled dense forward already applies the same erase).
       const at::Tensor excl_nlist = deepmd::applyPairExclusionNlist(
           firstneigh_tensor, atype_Tensor, pair_exclude_table_, ntypes);
-      flat_outputs = run_model(coord_Tensor, atype_Tensor, excl_nlist,
-                               mapping_tensor, fparam_tensor, aparam_tensor,
-                               charge_spin_tensor);
+      flat_outputs =
+          run_model(coord_Tensor, atype_Tensor, excl_nlist, mapping_tensor,
+                    fparam_tensor, aparam_tensor, charge_spin_tensor);
     }
   }
 
diff --git a/source/api_cc/tests/test_deeppot_dpa1_pairexcl_ptexpt.cc b/source/api_cc/tests/test_deeppot_dpa1_pairexcl_ptexpt.cc
index 18a3016b03..a8f7e7daee 100644
--- a/source/api_cc/tests/test_deeppot_dpa1_pairexcl_ptexpt.cc
+++ b/source/api_cc/tests/test_deeppot_dpa1_pairexcl_ptexpt.cc
@@ -160,7 +160,8 @@ TYPED_TEST(TestInferDpa1PairExclPtExpt, graph_equals_nlist_route) {
 
 // The no-exclusion baseline exercises the EMPTY-table (identity) branch of the
 // C++ helpers; it must run cleanly and produce an energy that DIFFERS from the
-// excluded models (proving pair_exclude_types is genuinely active, not dropped).
+// excluded models (proving pair_exclude_types is genuinely active, not
+// dropped).
 TYPED_TEST(TestInferDpa1PairExclPtExpt, exclusion_is_active_vs_baseline) {
   using VALUETYPE = TypeParam;
   double e_none, e_g, e_n;
diff --git a/source/tests/common/dpmodel/test_apply_pair_exclusion.py b/source/tests/common/dpmodel/test_apply_pair_exclusion.py
index 2dad917120..d9ceb4a8f5 100644
--- a/source/tests/common/dpmodel/test_apply_pair_exclusion.py
+++ b/source/tests/common/dpmodel/test_apply_pair_exclusion.py
@@ -2,7 +2,9 @@
 import numpy as np
 import pytest
 
-from deepmd.dpmodel.utils.exclude_mask import PairExcludeMask
+from deepmd.dpmodel.utils.exclude_mask import (
+    PairExcludeMask,
+)
 from deepmd.dpmodel.utils.neighbor_graph import (
     NeighborGraph,
     apply_pair_exclusion,
diff --git a/source/tests/common/dpmodel/test_apply_pair_exclusion_nlist.py b/source/tests/common/dpmodel/test_apply_pair_exclusion_nlist.py
index fde86e7530..0d4f2c2304 100644
--- a/source/tests/common/dpmodel/test_apply_pair_exclusion_nlist.py
+++ b/source/tests/common/dpmodel/test_apply_pair_exclusion_nlist.py
@@ -7,13 +7,14 @@
 import numpy as np
 import pytest
 
-from deepmd.dpmodel.utils.exclude_mask import PairExcludeMask
+from deepmd.dpmodel.utils.exclude_mask import (
+    PairExcludeMask,
+)
 from deepmd.dpmodel.utils.nlist import (
     apply_pair_exclusion_nlist,
     build_neighbor_list,
 )
 
-
 # ---------------------------------------------------------------------------
 # Fixtures
 # ---------------------------------------------------------------------------
@@ -212,7 +213,9 @@ def _local_system():
 
 def test_default_neighbor_list_pair_excl_equals_seam() -> None:
     """DefaultNeighborList(pair_excl=excl) nlist equals build-then-apply."""
-    from deepmd.dpmodel.utils.default_neighbor_list import DefaultNeighborList
+    from deepmd.dpmodel.utils.default_neighbor_list import (
+        DefaultNeighborList,
+    )
 
     coord, atype = _local_system()
     rcut = 1.5
@@ -282,8 +285,12 @@ def test_nv_nlist_edges_pair_excl_raises():
     """
     import torch
 
-    from deepmd.dpmodel.utils.exclude_mask import PairExcludeMask
-    from deepmd.pt.utils.nv_nlist import NvNeighborList
+    from deepmd.dpmodel.utils.exclude_mask import (
+        PairExcludeMask,
+    )
+    from deepmd.pt.utils.nv_nlist import (
+        NvNeighborList,
+    )
 
     coord = torch.zeros((1, 4, 3), dtype=torch.float64)
     atype = torch.zeros((1, 4), dtype=torch.int64)
diff --git a/source/tests/common/dpmodel/test_graph_atomic_parity.py b/source/tests/common/dpmodel/test_graph_atomic_parity.py
index 2608c412c2..1332a36471 100644
--- a/source/tests/common/dpmodel/test_graph_atomic_parity.py
+++ b/source/tests/common/dpmodel/test_graph_atomic_parity.py
@@ -14,13 +14,13 @@
 from deepmd.dpmodel.model.ener_model import (
     EnergyModel,
 )
+from deepmd.dpmodel.utils.exclude_mask import (
+    PairExcludeMask,
+)
 from deepmd.dpmodel.utils.neighbor_graph import (
     apply_pair_exclusion,
     from_dense_quartet,
 )
-from deepmd.dpmodel.utils.exclude_mask import (
-    PairExcludeMask,
-)
 from deepmd.dpmodel.utils.nlist import (
     extend_input_and_build_neighbor_list,
 )
diff --git a/source/tests/pt_expt/utils/test_vesin_graph_builder.py b/source/tests/pt_expt/utils/test_vesin_graph_builder.py
index 41cf0245d3..c216676994 100644
--- a/source/tests/pt_expt/utils/test_vesin_graph_builder.py
+++ b/source/tests/pt_expt/utils/test_vesin_graph_builder.py
@@ -162,7 +162,9 @@ def test_vesin_pair_excl_oracle_set_equality(periodic):
 
 def test_vesin_nlist_edges_pair_excl_raises():
     """VesinNeighborList.build with return_mode='edges' and pair_excl raises NotImplementedError."""
-    from deepmd.pt_expt.utils.vesin_neighbor_list import VesinNeighborList
+    from deepmd.pt_expt.utils.vesin_neighbor_list import (
+        VesinNeighborList,
+    )
 
     coord = torch.zeros((1, 4, 3), dtype=torch.float64)
     atype = torch.zeros((1, 4), dtype=torch.int64)

From 58f771f888cfe102eb8737c7475ceb82f3a68e7f Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 5 Jul 2026 15:38:04 +0800
Subject: [PATCH 38/38] fix(docs+review): numpydoc See Also->Notes for C++
 cross-refs; address CodeQL/CodeRabbit

- RTD build failed: numpydoc's strict See Also parser rejected the C++-twin
  cross-reference prose entries ('Error parsing See Also entry ...'). Move the
  cross-refs to Notes sections (free-form reST) in apply_pair_exclusion and
  apply_pair_exclusion_nlist.
- main.py: drop stale 'no type exclusion' from the graph-eligibility comment
  (exclude_types is now graph-native; matches the docstring + ValueError).
- test_graph_atomic_parity: remove dead ds/ft/am chain (CodeQL unused 'am').
- test_neighbor_graph_builder: drop redundant local 'import unittest' (CodeQL).
---
 deepmd/dpmodel/utils/neighbor_graph/graph.py    | 17 +++++++++--------
 deepmd/dpmodel/utils/nlist.py                   | 17 +++++++++--------
 deepmd/pt_expt/entrypoints/main.py              |  2 +-
 .../common/dpmodel/test_graph_atomic_parity.py  |  3 ---
 .../dpmodel/test_neighbor_graph_builder.py      |  2 --
 5 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/deepmd/dpmodel/utils/neighbor_graph/graph.py b/deepmd/dpmodel/utils/neighbor_graph/graph.py
index 9586461312..338eb01182 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/graph.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/graph.py
@@ -211,14 +211,15 @@ def apply_pair_exclusion(
         A ``dataclasses.replace`` copy (or the original ``graph`` on early
         exit) with the exclusion applied.
 
-    See Also
-    --------
-    C++ twin ``applyPairExclusion`` in ``source/api_cc/include/commonPT.h``
-        The inference-path mirror. Same argument order (edge_index, edge_mask,
-        atype, ...), same variable names (``type_ij``, ``keep``): it computes
-        ``type_ij = atype[dst]*(ntypes+1) + atype[src]`` and ANDs the flat
-        ``(ntypes+1)^2`` table lookup into ``edge_mask`` (mask-only mode; no
-        compact variant on the compiled path).
+    Notes
+    -----
+    The C++ inference-path mirror is ``applyPairExclusion`` in
+    ``source/api_cc/include/commonPT.h``. It uses the same argument order
+    (edge_index, edge_mask, atype, ...) and the same variable names
+    (``type_ij``, ``keep``): it computes
+    ``type_ij = atype[dst]*(ntypes+1) + atype[src]`` and ANDs the flat
+    ``(ntypes+1)^2`` table lookup into ``edge_mask`` (mask-only mode; no
+    compact variant on the compiled path).
     """
     import dataclasses
 
diff --git a/deepmd/dpmodel/utils/nlist.py b/deepmd/dpmodel/utils/nlist.py
index 3ee852e40f..8516409c1b 100644
--- a/deepmd/dpmodel/utils/nlist.py
+++ b/deepmd/dpmodel/utils/nlist.py
@@ -92,14 +92,6 @@ def apply_pair_exclusion_nlist(
     This is the nlist-representation counterpart of
     :func:`deepmd.dpmodel.utils.neighbor_graph.apply_pair_exclusion`.
 
-    See Also
-    --------
-    C++ twin ``applyPairExclusionNlist`` in ``source/api_cc/include/commonPT.h``
-        The inference-path mirror. Same argument order (nlist, atype_ext, ...),
-        same variable names (``type_ij``, ``keep``): it computes ``type_ij``
-        from the center/neighbor types via the flat ``(ntypes+1)^2`` table and
-        replaces excluded entries with ``-1``.
-
     Parameters
     ----------
     nlist : Array
@@ -115,6 +107,15 @@ def apply_pair_exclusion_nlist(
     Array
         Neighbor list of the same shape with excluded entries set to ``-1``.
         Erasing ``-1`` entries a second time is a no-op (idempotent).
+
+    Notes
+    -----
+    The C++ inference-path mirror is ``applyPairExclusionNlist`` in
+    ``source/api_cc/include/commonPT.h``. It uses the same argument order
+    (nlist, atype_ext, ...) and the same variable names (``type_ij``,
+    ``keep``): it computes ``type_ij`` from the center/neighbor types via
+    the flat ``(ntypes+1)^2`` table and replaces excluded entries with
+    ``-1``.
     """
     if pair_excl is None or len(pair_excl.exclude_types) == 0:
         return nlist
diff --git a/deepmd/pt_expt/entrypoints/main.py b/deepmd/pt_expt/entrypoints/main.py
index f74c911ff5..2e4f747ccb 100644
--- a/deepmd/pt_expt/entrypoints/main.py
+++ b/deepmd/pt_expt/entrypoints/main.py
@@ -569,7 +569,7 @@ def freeze(
     m.eval()
 
     # The graph lower is opt-in and only valid for graph-eligible models
-    # (dpa1 with concat tebd and no type exclusion; attention layers included
+    # (dpa1 with concat tebd; exclude_types and attention layers included
     # -- the carry-all pair enumeration exports via unbacked SymInts). Fail
     # fast with a clear message rather than emitting a broken .pt2. Enable the
     # per-atom virial for the graph form -- it is near-free there (one extra
diff --git a/source/tests/common/dpmodel/test_graph_atomic_parity.py b/source/tests/common/dpmodel/test_graph_atomic_parity.py
index 1332a36471..64bfe82f78 100644
--- a/source/tests/common/dpmodel/test_graph_atomic_parity.py
+++ b/source/tests/common/dpmodel/test_graph_atomic_parity.py
@@ -326,9 +326,6 @@ def test_apply_pair_exclusion_idempotent(pair_exclude_types):
     rng = np.random.default_rng(42)
     coord = rng.normal(size=(1, 5, 3)) * 1.5
     atype = np.array([[0, 1, 0, 1, 0]], dtype=np.int64)
-    ds = DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=[200], ntypes=2, attn_layer=0)
-    ft = InvarFitting("energy", 2, ds.get_dim_out(), 1, mixed_types=True)
-    am = DPAtomicModel(ds, ft, type_map=["a", "b"])
     ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
         coord, atype, 4.0, [200], mixed_types=True, box=None
     )
diff --git a/source/tests/common/dpmodel/test_neighbor_graph_builder.py b/source/tests/common/dpmodel/test_neighbor_graph_builder.py
index 30e1516a75..8f91f5af8c 100644
--- a/source/tests/common/dpmodel/test_neighbor_graph_builder.py
+++ b/source/tests/common/dpmodel/test_neighbor_graph_builder.py
@@ -421,8 +421,6 @@ def setUpClass(cls) -> None:
         try:
             import ase  # noqa: F401
         except ImportError as e:
-            import unittest
-
             raise unittest.SkipTest("ase not installed") from e
 
     def setUp(self) -> None: