diff --git a/upstream-buildbots/README.md b/upstream-buildbots/README.md index 3f3ce9a01..3c23aa738 100644 --- a/upstream-buildbots/README.md +++ b/upstream-buildbots/README.md @@ -5,6 +5,35 @@ These files can be used to recreate the docker container images and allow a deve During container build time certain ROCm components are pulled-in. Depending on the respective container / OS, this may result in a large container image. +## ROCm installation (manylinux images) + +The `manylinux-build-only` and `manylinux-hip-tpl` images install ROCm from a +[TheRock](https://github.com/ROCm/TheRock) nightly "dist" tarball rather than +from packages. +The full dist tarball contains the complete ROCm SDK (HIP runtime, device libs, +`rocminfo`, `rocblas`, `rocthrust`, ...) and is extracted into `/opt/rocm` by the +`install-rocm-nightly.sh` helper script in each image directory. + +By default the most recent nightly matching the configured base version and gfx +target is installed. The behavior is controlled via build args: + +| Build arg | Default | Meaning | +| ------------------- | -------- | ----------------------------------------------------------------------------------------------- | +| `ROCM_BASE_VERSION` | `7.14` | Base ROCm version to track. | +| `ROCM_GFX` | `gfx90a` | gfx target family. | +| `ROCM_NIGHTLY_DATE` | (empty) | Pin a specific build date `YYYYMMDD`; empty auto-detects the latest. Must be 8 digits when set. | + +To pin a reproducible build, pass the date at build time: + +``` +sudo docker build --build-arg ROCM_NIGHTLY_DATE=20260610 -t image-name -f Dockerfile . +``` + +The helper records what was actually installed in `/opt/rocm/.info/nightly` +(tarball name, URL, base version, gfx target, and date), complementing TheRock's +own `/opt/rocm/.info/version`, which only carries the base version and cannot +distinguish nightlies. 
+ We build the containers with a docker invocation adjacent to ``` @@ -20,6 +49,7 @@ sudo docker run --rm -it --network=host --device=/dev/kfd --device=/dev/dri --gr ``` ## Assumptions / Requirements + - The images require a working AMDGPU dkms / KFD to be installed in order to test work on the GPU. - The images assume a group id for the `render` group of `109`. - This is currently hardcoded in the Dockerfile. diff --git a/upstream-buildbots/manylinux-build-only/Dockerfile b/upstream-buildbots/manylinux-build-only/Dockerfile index e52c1e33d..7660bedcb 100644 --- a/upstream-buildbots/manylinux-build-only/Dockerfile +++ b/upstream-buildbots/manylinux-build-only/Dockerfile @@ -4,18 +4,19 @@ # https://github.com/ROCm/TheRock/blob/main/dockerfiles/build_manylinux_x86_64.Dockerfile FROM localhost/manylinux:base -# Add ROCm repository. -COPY rocm.repo /etc/yum.repos.d/rocm.repo -RUN yum clean packages && yum clean all +# Install ROCm from a TheRock nightly dist tarball. +# By default the most recent nightly for the given base version + gfx is used; +# pin a specific build with --build-arg ROCM_NIGHTLY_DATE=YYYYMMDD. +ARG ROCM_BASE_VERSION=7.14 +ARG ROCM_GFX=gfx90a +ARG ROCM_NIGHTLY_DATE= -# Install minimal ROCm components for buildbot. -RUN yum install -y \ - rocm-device-libs \ - rocm-core \ - rocminfo \ - hsa-rocr-devel7.1.1 \ - && yum clean all && \ - rm -rf /var/cache/yum +COPY install-rocm-nightly.sh /tmp/install-rocm-nightly.sh +RUN ROCM_BASE_VERSION="${ROCM_BASE_VERSION}" \ + ROCM_GFX="${ROCM_GFX}" \ + ROCM_NIGHTLY_DATE="${ROCM_NIGHTLY_DATE}" \ + bash /tmp/install-rocm-nightly.sh && \ + rm -f /tmp/install-rocm-nightly.sh # Update render group GID to match host for GPU access. 
RUN groupmod -g 109 render diff --git a/upstream-buildbots/manylinux-build-only/install-rocm-nightly.sh b/upstream-buildbots/manylinux-build-only/install-rocm-nightly.sh new file mode 100644 index 000000000..7ff653601 --- /dev/null +++ b/upstream-buildbots/manylinux-build-only/install-rocm-nightly.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# shellcheck shell=bash +# +# Install a TheRock ROCm "dist" nightly tarball into /opt/rocm. +# +# The full dist tarball contains the complete ROCm SDK (HIP runtime, device +# libs, rocminfo, rocblas, rocthrust, ...), so it fully replaces the previous +# yum/rocm.repo based install used by the manylinux buildbot images. +# +# Inputs are read from the environment so the Dockerfile can wire them through +# build ARGs: +# ROCM_BASE_VERSION base ROCm version, e.g. "7.14" (default 7.14) +# ROCM_GFX gfx target family, e.g. "gfx90a" (default gfx90a) +# ROCM_NIGHTLY_DATE pin a build date YYYYMMDD; empty => (default empty) +# auto-detect the most recent nightly + +set -euo pipefail + +RocmBaseVersion="${ROCM_BASE_VERSION:-7.14}" +RocmGfx="${ROCM_GFX:-gfx90a}" +RocmNightlyDate="${ROCM_NIGHTLY_DATE:-}" +RocmInstallDir="/opt/rocm" +RocmNightlyBaseUrl="https://rocm.nightlies.amd.com/tarball" + +if [ -n "${RocmNightlyDate}" ] && [[ ! "${RocmNightlyDate}" =~ ^[0-9]{8}$ ]]; then + echo "error: ROCM_NIGHTLY_DATE must be an 8-digit date (YYYYMMDD), got '${RocmNightlyDate}'" >&2 + exit 1 +fi + +# Shared curl options: fail on HTTP errors, stay quiet but show errors, follow +# redirects, bound the connection setup, and retry transient failures. The +# per-call --max-time bounds each transfer attempt (curl restarts the timer on +# every retry) and is set at each call site because the index fetch and the +# multi-GB tarball download need very different ceilings. 
+CurlOpts=(--fail --silent --show-error --location --connect-timeout 30 --retry 5 --retry-delay 5) + +# Print the tarball filename on stdout: built from the pinned date when set, +# else the newest nightly-index entry for version + gfx; returns 1 if none. +function resolveTarball() { + local Prefix="therock-dist-linux-${RocmGfx}-${RocmBaseVersion}.0a" + + if [ -n "${RocmNightlyDate}" ]; then + echo "${Prefix}${RocmNightlyDate}.tar.gz" + return 0 + fi + + # Extract index entries with our literal prefix (\Q..\E: dots are not regex + # wildcards); zero-padded YYYYMMDD dates sort lexically, so tail -1 is newest. + local Latest + Latest="$(curl "${CurlOpts[@]}" --max-time 60 "${RocmNightlyBaseUrl}/" | + grep -oP "\Q${Prefix}\E[0-9]{8}\.tar\.gz" | + sort -u | + tail -1)" + + if [ -z "${Latest}" ]; then + echo "error: no nightly tarball found for ${RocmGfx} version ${RocmBaseVersion} at ${RocmNightlyBaseUrl}/" >&2 + return 1 + fi + + echo "${Latest}" +} + +# Download the resolved tarball, extract it into ${RocmInstallDir}, and write +# build details to ${RocmInstallDir}/.info/nightly; errors if .info is missing. +function doInstall() { + local Tarball + Tarball="$(resolveTarball)" + local Url="${RocmNightlyBaseUrl}/${Tarball}" + + echo "Installing ROCm nightly: ${Tarball}" + echo " from: ${Url}" + echo " into: ${RocmInstallDir}" + + local TmpDir + TmpDir="$(mktemp -d)" + # shellcheck disable=SC2064 + trap "rm -rf '${TmpDir}'" EXIT + + # The tarball can be multiple GB; allow up to 30 minutes per transfer attempt. + curl "${CurlOpts[@]}" --max-time 1800 -o "${TmpDir}/${Tarball}" "${Url}" + mkdir -p "${RocmInstallDir}" + # --no-same-owner: as root, tar would otherwise restore the archive's uid/gid + # (which may fail in rootless builds where those ids are unmapped). + tar --no-same-owner -xf "${TmpDir}/${Tarball}" -C "${RocmInstallDir}" + + # Record what was actually installed. TheRock's own /opt/rocm/.info/version + # only carries the base version (e.g. 7.14.0), which cannot distinguish + # nightlies. 
The date is the 8 digits before ".tar.gz" (offset -15 = 8 + 7). + local Date="${Tarball: -15:8}" + local InfoDir="${RocmInstallDir}/.info" + if [ ! 
-d "${InfoDir}" ]; then + echo "error: expected ${InfoDir} to exist after extraction; the tarball may be incomplete or have an unexpected layout" >&2 + return 1 + fi + { + echo "tarball=${Tarball}" + echo "url=${Url}" + echo "base_version=${RocmBaseVersion}" + echo "gfx=${RocmGfx}" + echo "date=${Date}" + } >"${InfoDir}/nightly" + + echo "ROCm nightly install complete." +} + +doInstall diff --git a/upstream-buildbots/manylinux-build-only/rocm.repo b/upstream-buildbots/manylinux-build-only/rocm.repo deleted file mode 100644 index a25b014e9..000000000 --- a/upstream-buildbots/manylinux-build-only/rocm.repo +++ /dev/null @@ -1,7 +0,0 @@ -[rocm] -name=ROCm 7.1.1 repository -baseurl=https://repo.radeon.com/rocm/el8/7.1.1/main -enabled=1 -priority=50 -gpgcheck=1 -gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key diff --git a/upstream-buildbots/manylinux-hip-tpl/Dockerfile b/upstream-buildbots/manylinux-hip-tpl/Dockerfile index 51e8d46ba..0a5a9b4fb 100644 --- a/upstream-buildbots/manylinux-hip-tpl/Dockerfile +++ b/upstream-buildbots/manylinux-hip-tpl/Dockerfile @@ -6,22 +6,21 @@ FROM localhost/manylinux:base ARG WORKBASE="/opt" -# Add ROCm repository. -COPY rocm.repo /etc/yum.repos.d/rocm.repo -RUN yum clean packages && yum clean all - -# Install minimal ROCm components for buildbot. -RUN yum install -y \ - rocm-device-libs \ - rocm-core \ - rocminfo \ - hsa-rocr-devel7.1.1 \ - rocm-hip-runtime7.1.1.x86_64 \ - rocm-hip-runtime-devel7.1.1.x86_64 \ - rocthrust-devel \ - rocblas \ - && yum clean all && \ - rm -rf /var/cache/yum +# Install ROCm from a TheRock nightly dist tarball. +# The full dist tarball already contains the HIP runtime, rocblas, rocthrust, +# etc., so no extra ROCm packages are needed here. +# By default the most recent nightly for the given base version + gfx is used; +# pin a specific build with --build-arg ROCM_NIGHTLY_DATE=YYYYMMDD. 
+ARG ROCM_BASE_VERSION=7.14 +ARG ROCM_GFX=gfx90a +ARG ROCM_NIGHTLY_DATE= + +COPY install-rocm-nightly.sh /tmp/install-rocm-nightly.sh +RUN ROCM_BASE_VERSION="${ROCM_BASE_VERSION}" \ + ROCM_GFX="${ROCM_GFX}" \ + ROCM_NIGHTLY_DATE="${ROCM_NIGHTLY_DATE}" \ + bash /tmp/install-rocm-nightly.sh && \ + rm -f /tmp/install-rocm-nightly.sh # Update render group GID to match host for GPU access. RUN groupmod -g 109 render diff --git a/upstream-buildbots/manylinux-hip-tpl/install-rocm-nightly.sh b/upstream-buildbots/manylinux-hip-tpl/install-rocm-nightly.sh new file mode 100644 index 000000000..7ff653601 --- /dev/null +++ b/upstream-buildbots/manylinux-hip-tpl/install-rocm-nightly.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# shellcheck shell=bash +# +# Install a TheRock ROCm "dist" nightly tarball into /opt/rocm. +# +# The full dist tarball contains the complete ROCm SDK (HIP runtime, device +# libs, rocminfo, rocblas, rocthrust, ...), so it fully replaces the previous +# yum/rocm.repo based install used by the manylinux buildbot images. +# +# Inputs are read from the environment so the Dockerfile can wire them through +# build ARGs: +# ROCM_BASE_VERSION base ROCm version, e.g. "7.14" (default 7.14) +# ROCM_GFX gfx target family, e.g. "gfx90a" (default gfx90a) +# ROCM_NIGHTLY_DATE pin a build date YYYYMMDD; empty => (default empty) +# auto-detect the most recent nightly + +set -euo pipefail + +RocmBaseVersion="${ROCM_BASE_VERSION:-7.14}" +RocmGfx="${ROCM_GFX:-gfx90a}" +RocmNightlyDate="${ROCM_NIGHTLY_DATE:-}" +RocmInstallDir="/opt/rocm" +RocmNightlyBaseUrl="https://rocm.nightlies.amd.com/tarball" + +if [ -n "${RocmNightlyDate}" ] && [[ ! "${RocmNightlyDate}" =~ ^[0-9]{8}$ ]]; then + echo "error: ROCM_NIGHTLY_DATE must be an 8-digit date (YYYYMMDD), got '${RocmNightlyDate}'" >&2 + exit 1 +fi + +# Shared curl options: fail on HTTP errors, stay quiet but show errors, follow +# redirects, bound the connection setup, and retry transient failures. 
The +# per-call --max-time bounds each transfer attempt (curl restarts the timer on +# every retry) and is set at each call site because the index fetch and the +# multi-GB tarball download need very different ceilings. +CurlOpts=(--fail --silent --show-error --location --connect-timeout 30 --retry 5 --retry-delay 5) + +# Print the tarball filename on stdout: built from the pinned date when set, +# else the newest nightly-index entry for version + gfx; returns 1 if none. +function resolveTarball() { + local Prefix="therock-dist-linux-${RocmGfx}-${RocmBaseVersion}.0a" + + if [ -n "${RocmNightlyDate}" ]; then + echo "${Prefix}${RocmNightlyDate}.tar.gz" + return 0 + fi + + # Extract index entries with our literal prefix (\Q..\E: dots are not regex + # wildcards); zero-padded YYYYMMDD dates sort lexically, so tail -1 is newest. + local Latest + Latest="$(curl "${CurlOpts[@]}" --max-time 60 "${RocmNightlyBaseUrl}/" | + grep -oP "\Q${Prefix}\E[0-9]{8}\.tar\.gz" | + sort -u | + tail -1)" + + if [ -z "${Latest}" ]; then + echo "error: no nightly tarball found for ${RocmGfx} version ${RocmBaseVersion} at ${RocmNightlyBaseUrl}/" >&2 + return 1 + fi + + echo "${Latest}" +} + +# Download the resolved tarball, extract it into ${RocmInstallDir}, and write +# build details to ${RocmInstallDir}/.info/nightly; errors if .info is missing. +function doInstall() { + local Tarball + Tarball="$(resolveTarball)" + local Url="${RocmNightlyBaseUrl}/${Tarball}" + + echo "Installing ROCm nightly: ${Tarball}" + echo " from: ${Url}" + echo " into: ${RocmInstallDir}" + + local TmpDir + TmpDir="$(mktemp -d)" + # shellcheck disable=SC2064 + trap "rm -rf '${TmpDir}'" EXIT + + # The tarball can be multiple GB; allow up to 30 minutes per transfer attempt. 
+ curl "${CurlOpts[@]}" --max-time 1800 -o "${TmpDir}/${Tarball}" "${Url}" + mkdir -p "${RocmInstallDir}" + # --no-same-owner: as root, tar would otherwise restore the archive's uid/gid + # (which may fail in rootless builds where those ids are unmapped). + tar --no-same-owner -xf "${TmpDir}/${Tarball}" -C "${RocmInstallDir}" + + # Record what was actually installed. TheRock's own /opt/rocm/.info/version + # only carries the base version (e.g. 7.14.0), which cannot distinguish + # nightlies. The date is the 8 digits before ".tar.gz" (offset -15 = 8 + 7). 
+ local Date="${Tarball: -15:8}" + local InfoDir="${RocmInstallDir}/.info" + if [ ! -d "${InfoDir}" ]; then + echo "error: expected ${InfoDir} to exist after extraction; the tarball may be incomplete or have an unexpected layout" >&2 + return 1 + fi + { + echo "tarball=${Tarball}" + echo "url=${Url}" + echo "base_version=${RocmBaseVersion}" + echo "gfx=${RocmGfx}" + echo "date=${Date}" + } >"${InfoDir}/nightly" + + echo "ROCm nightly install complete." +} + +doInstall diff --git a/upstream-buildbots/manylinux-hip-tpl/rocm.repo b/upstream-buildbots/manylinux-hip-tpl/rocm.repo deleted file mode 100644 index a25b014e9..000000000 --- a/upstream-buildbots/manylinux-hip-tpl/rocm.repo +++ /dev/null @@ -1,7 +0,0 @@ -[rocm] -name=ROCm 7.1.1 repository -baseurl=https://repo.radeon.com/rocm/el8/7.1.1/main -enabled=1 -priority=50 -gpgcheck=1 -gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key