alibaba · shun001 · Jun 26, 2026
diff --git a/.github/workflows/ci-npu-mindspeed.yml b/.github/workflows/ci-npu-mindspeed.yml
@@ -0,0 +1,211 @@
+# CI for the MindSpeed/Megatron integration on Ascend NPU: installs MindSpeed
+# at a configurable ref and runs the Megatron offload tests on two processes.
+name: MindSpeed NPU Tests
+
+on:
+  workflow_dispatch:
+    inputs:
+      mindspeed_repo:
+        description: "MindSpeed git repository"
+        required: false
+        default: "https://github.com/ascend/MindSpeed.git"
+      mindspeed_ref:
+        description: "MindSpeed branch, tag, or ref to install"
+        required: false
+        default: "core_r0.16.0"
+  push:
+    branches: [main, npu_ci_all]
+    paths:
+      - ".github/workflows/ci-npu-mindspeed.yml"
+      - "mcore_adapter/**"
+      - "roll/third_party/megatron/**"
+      - "tests/third_party/megatron/**"
+      - "requirements_common.txt"
+      - "requirements_vision.txt"
+      - "setup.py"
+      - "pyproject.toml"
+  pull_request:
+    branches: [main, npu_ci_all]
+    paths:
+      - ".github/workflows/ci-npu-mindspeed.yml"
+      - "mcore_adapter/**"
+      - "roll/third_party/megatron/**"
+      - "tests/third_party/megatron/**"
+      - "requirements_common.txt"
+      - "requirements_vision.txt"
+      - "setup.py"
+      - "pyproject.toml"
+
+permissions:
+  contents: read
+
+# One active run per workflow and ref; a newer run cancels the older one.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  mindspeed-npu-test:
+    name: MindSpeed 0.16 Core NPU Tests
+    # Skip pull requests whose head branch lives in a fork.
+    if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository
+    runs-on: linux-aarch64-a3-8
+    timeout-minutes: 90
+    container:
+      image: quay.io/ascend/vllm-ascend:v0.18.0-a3
+    env:
+      # PIP_CACHE_DIR is exported by the "Configure pip cache" step rather than
+      # set here: in a container job `github.workspace` expands to the host-side
+      # path, not the directory the workspace is mounted at in the container.
+      PIP_INDEX_URL: https://repo.huaweicloud.com/repository/pypi/simple
+      PIP_TRUSTED_HOST: repo.huaweicloud.com
+      HF_ENDPOINT: https://hf-mirror.com
+      PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
+      TASK_QUEUE_ENABLE: "2"
+      VLLM_USE_V1: "1"
+      VLLM_ASCEND_ENABLE_FLASHCOMM: "0"
+      VLLM_ASCEND_ENABLE_NZ: "0"
+      # Manual-dispatch inputs take precedence; other events use the defaults.
+      MINDSPEED_REPO: ${{ github.event.inputs.mindspeed_repo || 'https://github.com/ascend/MindSpeed.git' }}
+      MINDSPEED_REF: ${{ github.event.inputs.mindspeed_ref || 'core_r0.16.0' }}
+      MINDSPEED_CACHE_KEY: "core-r0.16.0"
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Cache NPU pip packages
+        uses: actions/cache@v4
+        with:
+          path: .pip-cache
+          key: ${{ runner.os }}-npu-mindspeed-${{ env.MINDSPEED_CACHE_KEY }}-${{ hashFiles('requirements_common.txt', 'requirements_vision.txt', 'mcore_adapter/pyproject.toml', 'mcore_adapter/requirements.txt', 'setup.py', 'pyproject.toml', '.github/workflows/ci-npu-mindspeed.yml') }}
+          restore-keys: |
+            ${{ runner.os }}-npu-mindspeed-${{ env.MINDSPEED_CACHE_KEY }}-
+            ${{ runner.os }}-npu-mindspeed-
+            ${{ runner.os }}-npu-pip-
+
+      - name: Configure pip cache
+        run: |
+          # Use the container-side workspace path so pip fills the same
+          # `.pip-cache` directory that the cache step restores and saves.
+          pip_cache_dir="${GITHUB_WORKSPACE}/.pip-cache"
+          mkdir -p "${pip_cache_dir}"
+          echo "PIP_CACHE_DIR=${pip_cache_dir}" >> "${GITHUB_ENV}"
+
+      - name: Configure Ascend runtime
+        shell: bash
+        run: |
+          # Source whichever toolkit environment scripts exist in the image.
+          for env_file in \
+            /usr/local/Ascend/ascend-toolkit/set_env.sh \
+            /usr/local/Ascend/nnal/atb/set_env.sh; do
+            [ -f "${env_file}" ] && source "${env_file}"
+          done
+
+          # Default anything the scripts left unset to the standard layout.
+          ASCEND_HOME_PATH="${ASCEND_HOME_PATH:-/usr/local/Ascend/ascend-toolkit/latest}"
+          ASCEND_TOOLKIT_HOME="${ASCEND_TOOLKIT_HOME:-${ASCEND_HOME_PATH}}"
+          ASCEND_OPP_PATH="${ASCEND_OPP_PATH:-${ASCEND_HOME_PATH}/opp}"
+          ASCEND_AICPU_PATH="${ASCEND_AICPU_PATH:-${ASCEND_HOME_PATH}}"
+          # `${VAR:+:...}` appends the previous value only when it is non-empty,
+          # so no dangling ":" (an empty entry means the current directory) is left.
+          LD_LIBRARY_PATH="${ASCEND_HOME_PATH}/lib64:${ASCEND_HOME_PATH}/runtime/lib64:${ASCEND_HOME_PATH}/runtime/lib64/stub:${ASCEND_HOME_PATH}/tools/hccl/lib64:${ASCEND_HOME_PATH}/hccl/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+
+          for path in \
+            "${ASCEND_OPP_PATH}/built-in/op_impl/ai_core/tbe" \
+            "${ASCEND_HOME_PATH}/python/site-packages"; do
+            [ -d "${path}" ] && PYTHONPATH="${path}${PYTHONPATH:+:${PYTHONPATH}}"
+          done
+
+          # Persist the resolved values for the remaining steps.
+          {
+            echo "ASCEND_HOME_PATH=${ASCEND_HOME_PATH}"
+            echo "ASCEND_TOOLKIT_HOME=${ASCEND_TOOLKIT_HOME}"
+            echo "ASCEND_OPP_PATH=${ASCEND_OPP_PATH}"
+            echo "ASCEND_AICPU_PATH=${ASCEND_AICPU_PATH}"
+            echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}"
+            echo "PYTHONPATH=${PYTHONPATH:-}"
+          } >> "${GITHUB_ENV}"
+          {
+            echo "${ASCEND_HOME_PATH}/bin"
+            echo "${ASCEND_HOME_PATH}/compiler/ccec_compiler/bin"
+          } >> "${GITHUB_PATH}"
+
+      - name: Check NPU environment
+        run: |
+          python3 - <<'PY'
+          import importlib.util
+
+          import torch
+          import torch_npu
+
+          if importlib.util.find_spec("tbe") is None:
+              raise RuntimeError("CANN tbe Python module is not visible in PYTHONPATH")
+          if not torch.npu.is_available():
+              raise RuntimeError("torch.npu.is_available() is False")
+          print(f"npu_device_count={torch.npu.device_count()}")
+          PY
+
+      - name: Install ROLL requirements
+        shell: bash
+        run: |
+          python3 -m pip install --upgrade pip wheel
+          # torchair still imports pkg_resources; setuptools 82 removed it.
+          python3 -m pip install "setuptools<82"
+          python3 -m pip install --retries 10 --timeout 120 pytest-timeout
+          python3 -m pip install --retries 10 --timeout 120 -r requirements_common.txt
+          python3 -m pip install --retries 10 --timeout 120 deepspeed==0.16.4 tensorboard
+          # Re-apply the pin after the installs above and check the import.
+          python3 -m pip install "setuptools<82"
+          python3 -c "import pkg_resources"
+
+      - name: Install MindSpeed core_r0.16.0
+        shell: bash
+        run: |
+          set -eo pipefail
+          export MINDSPEED_SRC="/tmp/MindSpeed"
+          rm -rf "${MINDSPEED_SRC}"
+          # Fetch the ref directly: `git clone --branch` accepts only branch
+          # and tag names, but the dispatch input also allows other refs.
+          git init --quiet "${MINDSPEED_SRC}"
+          cd "${MINDSPEED_SRC}"
+          git remote add origin "${MINDSPEED_REPO}"
+          git fetch --depth 1 origin "${MINDSPEED_REF}"
+          git checkout --quiet --detach FETCH_HEAD
+          python3 -m pip install --no-build-isolation --no-deps -e .
+
+      - name: Install ROLL
+        run: |
+          python3 -m pip install -e .
+
+      - name: Prepare Megatron test model
+        shell: bash
+        run: |
+          set -eo pipefail
+          # Prefer a pre-provisioned local copy of the model when it exists.
+          local_model="/data/cpfs_0/common/models/Qwen2.5-0.5B-Instruct"
+          if [ -d "${local_model}" ]; then
+            echo "ROLL_MEGATRON_TEST_MODEL=${local_model}" >> "${GITHUB_ENV}"
+            exit 0
+          fi
+
+          # Otherwise download the snapshot and export its path.
+          python3 - <<'PY'
+          import os
+          from huggingface_hub import snapshot_download
+
+          model_path = snapshot_download("Qwen/Qwen2.5-0.5B-Instruct")
+          with open(os.environ["GITHUB_ENV"], "a", encoding="utf-8") as env_file:
+              env_file.write(f"ROLL_MEGATRON_TEST_MODEL={model_path}\n")
+          PY
+
+      - name: Run MindSpeed Megatron offload tests
+        shell: bash
+        run: |
+          export PYTHONPATH="${GITHUB_WORKSPACE}/mcore_adapter/src:${GITHUB_WORKSPACE}:${PYTHONPATH:-}"
+          torchrun --standalone --nnodes=1 --nproc-per-node=2 \
+            -m pytest -q --tb=short tests/third_party/megatron/test_offload_states.py
+        env:
+          ROLL_NPU_CI: "1"
diff --git a/examples/ascend_examples/qwen3_4B_dpo_megatron.yaml b/examples/ascend_examples/qwen3_4B_dpo_megatron.yaml
@@ -0,0 +1,100 @@
+defaults:  # Hydra defaults list; @_here_ places each entry in this config's own package
+  - ../config/deepspeed_zero@_here_
+  - ../config/deepspeed_zero2@_here_
+  - ../config/deepspeed_zero3@_here_
+  - ../config/deepspeed_zero3_cpuoffload@_here_
+
+hydra:
+  run:
+    dir: .  # use the current directory as the Hydra run directory
+  output_subdir: null  # null turns off Hydra's per-run config subdirectory
+
+exp_name: "qwen3-4B-dpo-config"
+seed: 42
+logging_dir: ./output/logs
+output_dir: ./output
+system_envs:
+  USE_MODELSCOPE: '1'  # quoted so the value stays a string
+
+checkpoint_config:
+  type: file_system
+  output_dir: ./ckpt
+
+
+track_name: None  # NOTE(review): YAML loads this as the string "None", not null - confirm intended
+
+
+max_steps: 500
+save_steps: 500  # equals max_steps
+logging_steps: 1
+eval_steps: 100
+resume_from_checkpoint: false
+
+sequence_length: 512
+train_batch_size: 64
+val_batch_size: 64
+
+# local_rank: -1
+num_nodes: 1
+num_gpus_per_node: 4  # the device_mapping expressions below spell out ids 0-1 and 2-3
+
+pretrain: Qwen/Qwen3-4B
+
+ipo: false
+beta: 0.1
+label_smoothing: 0.0
+
+chosen_key: chosen
+rejected_key: rejected
+
+validation:
+  data_args:
+    template: qwen3
+    file_name: data/comparison_gpt4_data_zh.json  # same file as actor_train.data_args.file_name
+
+actor_train:
+  model_args:
+    disable_gradient_checkpointing: false
+    dtype: bf16
+    model_type: ~  # ~ is YAML null
+  training_args:
+    lr_scheduler_type: constant
+    learning_rate: 1.0e-6  # decimal point and signed exponent keep this a float under YAML 1.1 loaders
+    weight_decay: 0
+    per_device_train_batch_size: 16
+    gradient_accumulation_steps: 1
+    warmup_steps: 20
+    num_train_epochs: 10
+  data_args:
+    template: qwen3
+    file_name:
+      - data/comparison_gpt4_data_zh.json
+    dataset_dir: data
+    preprocessing_num_workers: 1
+  strategy_args:
+    strategy_name: megatron_train  # NOTE(review): the deepspeed entries in defaults are still composed - confirm they are needed
+    strategy_config:
+      tensor_model_parallel_size: 1
+      pipeline_model_parallel_size: 1
+      expert_model_parallel_size: 1
+      use_distributed_optimizer: true
+      recompute_granularity: full
+  device_mapping: list(range(0,2))  # a plain string to YAML; presumably evaluated by the consumer - TODO confirm
+  infer_batch_size: 16
+
+
+reference:
+  model_args:
+    disable_gradient_checkpointing: true
+    dtype: bf16
+    model_type: ~  # ~ is YAML null
+  data_args:
+    template: qwen3
+  strategy_args:
+    strategy_name: megatron_infer
+    strategy_config:
+      tensor_model_parallel_size: 1
+      pipeline_model_parallel_size: 1
+      expert_model_parallel_size: 1
+  device_mapping: list(range(2,4))  # a plain string to YAML; presumably evaluated by the consumer - TODO confirm
+  infer_batch_size: 16
diff --git a/mcore_adapter/src/mcore_adapter/initialize.py b/mcore_adapter/src/mcore_adapter/initialize.py
@@ -1,18 +1,80 @@
 import os
 import random
+import sys
+from typing import TYPE_CHECKING
 
 import numpy as np
 import torch
 from megatron.core import mpu, tensor_parallel
 
 from .platforms import current_platform
-from .training_args import TrainingArguments
 from .utils import get_logger
 
+if TYPE_CHECKING:
+    from .training_args import TrainingArguments
+
 
 logger = get_logger(__name__)
 
 
+_NPU_RUNTIME_BOOTSTRAPPED = False
+
+
+def bootstrap_npu_runtime():
+    global _NPU_RUNTIME_BOOTSTRAPPED
+
+    if _NPU_RUNTIME_BOOTSTRAPPED or not current_platform.is_npu():
+        return
+
+    import torch_npu  # noqa: F401
+
+    try:
+        import mindspeed.megatron_adaptor  # noqa: F401
+    except ImportError:
+        pass
+
+    import megatron.core.tensor_parallel.random as meg_random
+
+    if not hasattr(meg_random, "_npu_patched"):
+        meg_random.initialize_rng_tracker()
+
+        def patched_set(new_state, device=-1, graph_safe=False):
+            torch.npu.set_rng_state(new_state)
+            return
+
+        def patched_get(device="npu", clone=False, graph_safe=False):
+            return torch.npu.get_rng_state()
+
+        meg_random._set_cuda_rng_state = patched_set
+        meg_random._get_cuda_rng_state = patched_get
+
+        rng_state = torch.npu.get_rng_state()
+        meg_random._CUDA_RNG_STATE_TRACKER.states_["model-parallel-rng"] = rng_state
+        meg_random._CUDA_RNG_STATE_TRACKER.states_["data-parallel-rng"] = rng_state
+
+        meg_random._npu_patched = True
+
+    if not hasattr(torch.cuda, "_npu_patched"):
+        torch.cuda.current_device = lambda: torch.npu.current_device()
+        torch.cuda._npu_patched = True
+
+    _NPU_RUNTIME_BOOTSTRAPPED = True
+
+
+def apply_mindspeed_feature_defaults(config):
+    if "mindspeed.megatron_adaptor" not in sys.modules:
+        return
+
+    try:
+        from mindspeed.args_utils import get_mindspeed_args
+    except ImportError:
+        return
+
+    for name, value in vars(get_mindspeed_args(get_defaults=True)).items():
+        if not hasattr(config, name):
+            setattr(config, name, value)
+
+
 def is_distribute_initialized():
     return mpu.model_parallel_is_initialized()
 
@@ -29,13 +91,14 @@ def _set_random_seed(seed_):
         random.seed(seed)
         np.random.seed(seed)
         torch.manual_seed(seed)
-        if current_platform.device_count() > 0:
+        if current_platform.is_cuda() and current_platform.device_count() > 0:
             tensor_parallel.model_parallel_cuda_manual_seed(seed)
     else:
         raise ValueError("Seed ({}) should be a positive integer.".format(seed))
 
 
 def initialize_megatron(args: "TrainingArguments"):
+    bootstrap_npu_runtime()
     if not is_distribute_initialized():
         _initialize_distributed(args)
     _set_random_seed(args.seed)