Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 187 additions & 0 deletions .github/workflows/ci-npu-mindspeed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
# Workflow metadata and triggers for the MindSpeed NPU test job.
name: MindSpeed NPU Tests

on:
  # Manual runs can override which MindSpeed repository and ref get installed.
  workflow_dispatch:
    inputs:
      mindspeed_repo:
        description: "MindSpeed git repository"
        required: false
        default: "https://github.com/ascend/MindSpeed.git"
      mindspeed_ref:
        description: "MindSpeed branch, tag, or ref to install"
        required: false
        default: "core_r0.16.0"

  # Pushes to the listed branches trigger a run only when one of these
  # paths changes.
  push:
    branches:
      - main
      - npu_ci_all
    paths:
      - ".github/workflows/ci-npu-mindspeed.yml"
      - "mcore_adapter/**"
      - "roll/third_party/megatron/**"
      - "tests/third_party/megatron/**"
      - "requirements_common.txt"
      - "requirements_vision.txt"
      - "setup.py"
      - "pyproject.toml"

  # Same branch and path filters as `push`; keep the two lists in sync.
  pull_request:
    branches:
      - main
      - npu_ci_all
    paths:
      - ".github/workflows/ci-npu-mindspeed.yml"
      - "mcore_adapter/**"
      - "roll/third_party/megatron/**"
      - "tests/third_party/megatron/**"
      - "requirements_common.txt"
      - "requirements_vision.txt"
      - "setup.py"
      - "pyproject.toml"

# The workflow token only needs to read the repository.
permissions:
  contents: read

# A newer run for the same workflow and ref cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  mindspeed-npu-test:
    name: MindSpeed 0.16 Core NPU Tests
    # Pull requests run only when the head branch lives in this repository;
    # every other event type always runs.
    if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository
    runs-on: linux-aarch64-a3-8
    timeout-minutes: 90
    container:
      image: quay.io/ascend/vllm-ascend:v0.18.0-a3
    env:
      # PIP_CACHE_DIR is deliberately NOT set here. `${{ github.workspace }}`
      # expands to the runner-host path, which differs from the workspace
      # mount inside a job container, so pip would write its cache somewhere
      # other than the `.pip-cache` directory that actions/cache saves.
      # The "Configure pip cache" step exports it from $GITHUB_WORKSPACE.
      PIP_INDEX_URL: https://repo.huaweicloud.com/repository/pypi/simple
      PIP_TRUSTED_HOST: repo.huaweicloud.com
      HF_ENDPOINT: https://hf-mirror.com
      PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
      TASK_QUEUE_ENABLE: "2"
      VLLM_USE_V1: "1"
      VLLM_ASCEND_ENABLE_FLASHCOMM: "0"
      VLLM_ASCEND_ENABLE_NZ: "0"
      # workflow_dispatch inputs are empty for push / pull_request events,
      # so fall back to the same defaults the inputs declare.
      MINDSPEED_REPO: ${{ github.event.inputs.mindspeed_repo || 'https://github.com/ascend/MindSpeed.git' }}
      MINDSPEED_REF: ${{ github.event.inputs.mindspeed_ref || 'core_r0.16.0' }}
      # NOTE(review): static value; it does not follow MINDSPEED_REF when a
      # manual run installs a different ref.
      MINDSPEED_CACHE_KEY: "core-r0.16.0"

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # `path` is relative to the workspace; it must be the same directory
      # that "Configure pip cache" exports as PIP_CACHE_DIR.
      - name: Cache NPU pip packages
        uses: actions/cache@v4
        with:
          path: .pip-cache
          key: ${{ runner.os }}-npu-mindspeed-${{ env.MINDSPEED_CACHE_KEY }}-${{ hashFiles('requirements_common.txt', 'requirements_vision.txt', 'mcore_adapter/pyproject.toml', 'mcore_adapter/requirements.txt', 'setup.py', 'pyproject.toml', '.github/workflows/ci-npu-mindspeed.yml') }}
          restore-keys: |
            ${{ runner.os }}-npu-mindspeed-${{ env.MINDSPEED_CACHE_KEY }}-
            ${{ runner.os }}-npu-mindspeed-
            ${{ runner.os }}-npu-pip-

      # Resolve the cache directory from $GITHUB_WORKSPACE (the path as seen
      # inside the container) and export it for every later step.
      - name: Configure pip cache
        run: |
          pip_cache_dir="${GITHUB_WORKSPACE}/.pip-cache"
          mkdir -p "${pip_cache_dir}"
          echo "PIP_CACHE_DIR=${pip_cache_dir}" >> "${GITHUB_ENV}"

      # Source the CANN environment scripts when present, fill in defaults for
      # anything they left unset, and persist the result through GITHUB_ENV /
      # GITHUB_PATH so later steps see it.
      - name: Configure Ascend runtime
        shell: bash
        run: |
          for env_file in \
            /usr/local/Ascend/ascend-toolkit/set_env.sh \
            /usr/local/Ascend/nnal/atb/set_env.sh; do
            [ -f "${env_file}" ] && source "${env_file}"
          done

          ASCEND_HOME_PATH="${ASCEND_HOME_PATH:-/usr/local/Ascend/ascend-toolkit/latest}"
          ASCEND_TOOLKIT_HOME="${ASCEND_TOOLKIT_HOME:-${ASCEND_HOME_PATH}}"
          ASCEND_OPP_PATH="${ASCEND_OPP_PATH:-${ASCEND_HOME_PATH}/opp}"
          ASCEND_AICPU_PATH="${ASCEND_AICPU_PATH:-${ASCEND_HOME_PATH}}"
          LD_LIBRARY_PATH="${ASCEND_HOME_PATH}/lib64:${ASCEND_HOME_PATH}/runtime/lib64:${ASCEND_HOME_PATH}/runtime/lib64/stub:${ASCEND_HOME_PATH}/tools/hccl/lib64:${ASCEND_HOME_PATH}/hccl/lib64:${LD_LIBRARY_PATH:-}"

          for path in \
            "${ASCEND_OPP_PATH}/built-in/op_impl/ai_core/tbe" \
            "${ASCEND_HOME_PATH}/python/site-packages"; do
            [ -d "${path}" ] && PYTHONPATH="${path}:${PYTHONPATH:-}"
          done

          {
            echo "ASCEND_HOME_PATH=${ASCEND_HOME_PATH}"
            echo "ASCEND_TOOLKIT_HOME=${ASCEND_TOOLKIT_HOME}"
            echo "ASCEND_OPP_PATH=${ASCEND_OPP_PATH}"
            echo "ASCEND_AICPU_PATH=${ASCEND_AICPU_PATH}"
            echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}"
            echo "PYTHONPATH=${PYTHONPATH:-}"
          } >> "${GITHUB_ENV}"
          {
            echo "${ASCEND_HOME_PATH}/bin"
            echo "${ASCEND_HOME_PATH}/compiler/ccec_compiler/bin"
          } >> "${GITHUB_PATH}"

      # Fail fast when the tbe module or the NPU devices are not visible.
      - name: Check NPU environment
        run: |
          python3 - <<'PY'
          import importlib.util

          import torch
          import torch_npu

          if importlib.util.find_spec("tbe") is None:
              raise RuntimeError("CANN tbe Python module is not visible in PYTHONPATH")
          if not torch.npu.is_available():
              raise RuntimeError("torch.npu.is_available() is False")
          print(f"npu_device_count={torch.npu.device_count()}")
          PY

      - name: Install ROLL requirements
        shell: bash
        run: |
          python3 -m pip install --upgrade pip wheel
          # torchair still imports pkg_resources; setuptools 82 removed it.
          python3 -m pip install "setuptools<82"
          python3 -m pip install --retries 10 --timeout 120 pytest-timeout
          python3 -m pip install --retries 10 --timeout 120 -r requirements_common.txt
          python3 -m pip install --retries 10 --timeout 120 deepspeed==0.16.4 tensorboard
          # Re-pin after the installs above, then verify pkg_resources imports.
          python3 -m pip install "setuptools<82"
          python3 -c "import pkg_resources"

      - name: Install MindSpeed core_r0.16.0
        shell: bash
        run: |
          set -eo pipefail
          export MINDSPEED_SRC="/tmp/MindSpeed"
          rm -rf "${MINDSPEED_SRC}"
          # `git clone --branch` accepts branch and tag names only.
          git clone --depth 1 --branch "${MINDSPEED_REF}" "${MINDSPEED_REPO}" "${MINDSPEED_SRC}"
          cd "${MINDSPEED_SRC}"
          python3 -m pip install --no-build-isolation --no-deps -e .

      - name: Install ROLL
        run: |
          python3 -m pip install -e .

      # Use the model directory on shared storage when it exists; otherwise
      # download a snapshot. Either way the resulting path is exported as
      # ROLL_MEGATRON_TEST_MODEL.
      - name: Prepare Megatron test model
        shell: bash
        run: |
          set -eo pipefail
          local_model="/data/cpfs_0/common/models/Qwen2.5-0.5B-Instruct"
          if [ -d "${local_model}" ]; then
            echo "ROLL_MEGATRON_TEST_MODEL=${local_model}" >> "${GITHUB_ENV}"
            exit 0
          fi

          python3 - <<'PY'
          import os
          from huggingface_hub import snapshot_download

          model_path = snapshot_download("Qwen/Qwen2.5-0.5B-Instruct")
          with open(os.environ["GITHUB_ENV"], "a", encoding="utf-8") as env_file:
              env_file.write(f"ROLL_MEGATRON_TEST_MODEL={model_path}\n")
          PY

      # Launch pytest as a module under torchrun with two worker processes.
      - name: Run MindSpeed Megatron offload tests
        shell: bash
        run: |
          export PYTHONPATH="${GITHUB_WORKSPACE}/mcore_adapter/src:${GITHUB_WORKSPACE}:${PYTHONPATH:-}"
          torchrun --standalone --nnodes=1 --nproc-per-node=2 \
            -m pytest -q --tb=short tests/third_party/megatron/test_offload_states.py
        env:
          ROLL_NPU_CI: "1"
100 changes: 100 additions & 0 deletions examples/ascend_examples/qwen3_4B_dpo_megatron.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# DPO experiment config: Qwen3-4B with the Megatron train / infer strategies.
defaults:
  - ../config/deepspeed_zero@_here_
  - ../config/deepspeed_zero2@_here_
  - ../config/deepspeed_zero3@_here_
  - ../config/deepspeed_zero3_cpuoffload@_here_

hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "qwen3-4B-dpo-config"
seed: 42
logging_dir: ./output/logs
output_dir: ./output
system_envs:
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: ./ckpt

# NOTE(review): YAML reads `None` as the string "None", not as null;
# confirm that this is what the consumer expects.
track_name: None

max_steps: 500
save_steps: 500
logging_steps: 1
eval_steps: 100
resume_from_checkpoint: false

sequence_length: 512
train_batch_size: 64
val_batch_size: 64

# local_rank: -1
num_nodes: 1
num_gpus_per_node: 4

pretrain: Qwen/Qwen3-4B

ipo: false
beta: 0.1
label_smoothing: 0.0

chosen_key: chosen
rejected_key: rejected

validation:
  data_args:
    template: qwen3
    file_name: data/comparison_gpt4_data_zh.json

# Trainable policy: devices 0-1.
actor_train:
  model_args:
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: null
  training_args:
    lr_scheduler_type: constant
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 16
    gradient_accumulation_steps: 1
    warmup_steps: 20
    num_train_epochs: 10
  data_args:
    template: qwen3
    file_name:
      - data/comparison_gpt4_data_zh.json
    dataset_dir: data
    preprocessing_num_workers: 1
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      use_distributed_optimizer: true
      recompute_granularity: full
  device_mapping: list(range(0,2))
  infer_batch_size: 16

# Reference model: devices 2-3.
reference:
  model_args:
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: null
  data_args:
    template: qwen3
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
  device_mapping: list(range(2,4))
  infer_batch_size: 16
67 changes: 65 additions & 2 deletions mcore_adapter/src/mcore_adapter/initialize.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,80 @@
import os
import random
import sys
from typing import TYPE_CHECKING

import numpy as np
import torch
from megatron.core import mpu, tensor_parallel

from .platforms import current_platform
from .training_args import TrainingArguments
from .utils import get_logger

if TYPE_CHECKING:
from .training_args import TrainingArguments


logger = get_logger(__name__)


_NPU_RUNTIME_BOOTSTRAPPED = False


def bootstrap_npu_runtime():
    """Prepare the NPU runtime for Megatron; runs at most once per process.

    Returns immediately when the bootstrap has already completed or when
    ``current_platform.is_npu()`` is false. Otherwise it:

    * imports ``torch_npu``;
    * tries to import ``mindspeed.megatron_adaptor``; an ``ImportError`` is
      logged as a warning and the bootstrap continues without it;
    * replaces ``_set_cuda_rng_state`` / ``_get_cuda_rng_state`` in
      ``megatron.core.tensor_parallel.random`` with wrappers around
      ``torch.npu`` and stores the current NPU RNG state under the
      ``"model-parallel-rng"`` and ``"data-parallel-rng"`` tracker entries;
    * makes ``torch.cuda.current_device`` return ``torch.npu.current_device()``.

    Both patches are guarded by an ``_npu_patched`` marker attribute on the
    patched module, so they are applied only once.
    """
    global _NPU_RUNTIME_BOOTSTRAPPED

    if _NPU_RUNTIME_BOOTSTRAPPED or not current_platform.is_npu():
        return

    import torch_npu  # noqa: F401

    try:
        import mindspeed.megatron_adaptor  # noqa: F401
    except ImportError as exc:
        # Still best-effort, but no longer silent: a missing or broken
        # MindSpeed install would otherwise be invisible at this point.
        logger.warning(
            "Could not import mindspeed.megatron_adaptor; continuing without it: %s", exc
        )

    import megatron.core.tensor_parallel.random as meg_random

    if not hasattr(meg_random, "_npu_patched"):
        meg_random.initialize_rng_tracker()

        # The wrappers keep Megatron's parameter names so existing call sites
        # still bind; the extra arguments are ignored.
        def patched_set(new_state, device=-1, graph_safe=False):
            torch.npu.set_rng_state(new_state)

        def patched_get(device="npu", clone=False, graph_safe=False):
            return torch.npu.get_rng_state()

        meg_random._set_cuda_rng_state = patched_set
        meg_random._get_cuda_rng_state = patched_get

        # Both tracker entries start from the same current NPU RNG state.
        rng_state = torch.npu.get_rng_state()
        meg_random._CUDA_RNG_STATE_TRACKER.states_["model-parallel-rng"] = rng_state
        meg_random._CUDA_RNG_STATE_TRACKER.states_["data-parallel-rng"] = rng_state

        meg_random._npu_patched = True

    if not hasattr(torch.cuda, "_npu_patched"):
        # The lambda looks up torch.npu.current_device on every call.
        torch.cuda.current_device = lambda: torch.npu.current_device()
        torch.cuda._npu_patched = True

    _NPU_RUNTIME_BOOTSTRAPPED = True


def apply_mindspeed_feature_defaults(config):
    """Copy MindSpeed's default argument values onto ``config``.

    Does nothing unless ``mindspeed.megatron_adaptor`` has already been
    imported, and also does nothing when ``mindspeed.args_utils`` cannot be
    imported. Attributes that ``config`` already has are left untouched;
    only missing ones are set, in place.
    """
    adaptor_loaded = "mindspeed.megatron_adaptor" in sys.modules
    if adaptor_loaded:
        try:
            from mindspeed.args_utils import get_mindspeed_args
        except ImportError:
            return

        mindspeed_defaults = vars(get_mindspeed_args(get_defaults=True))
        for attr_name, default_value in mindspeed_defaults.items():
            if hasattr(config, attr_name):
                continue
            setattr(config, attr_name, default_value)


def is_distribute_initialized():
    """Return the result of ``mpu.model_parallel_is_initialized()``."""
    model_parallel_ready = mpu.model_parallel_is_initialized()
    return model_parallel_ready

Expand All @@ -29,13 +91,14 @@ def _set_random_seed(seed_):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if current_platform.device_count() > 0:
if current_platform.is_cuda() and current_platform.device_count() > 0:
tensor_parallel.model_parallel_cuda_manual_seed(seed)
else:
raise ValueError("Seed ({}) should be a positive integer.".format(seed))


def initialize_megatron(args: "TrainingArguments"):
bootstrap_npu_runtime()
if not is_distribute_initialized():
_initialize_distributed(args)
_set_random_seed(args.seed)
Expand Down
Loading
Loading