From 936345274273854e814f1600400991df6dc243fe Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Sun, 14 Jun 2026 13:54:55 -0600 Subject: [PATCH 1/2] Fix corruption due to lock sharding issues by centralizing locking --- .github/workflows/dynamic_arch.yml | 236 ++++++++++++++++++ CMakeLists.txt | 6 +- Makefile.install | 2 +- Makefile.rule | 6 +- common_thread.h | 3 + cpp_thread_test/CMakeLists.txt | 36 ++- cpp_thread_test/Makefile | 12 +- cpp_thread_test/cpp_thread_safety_common.h | 29 +++ cpp_thread_test/dgemm_thread_safety.cpp | 13 +- cpp_thread_test/dgemm_thread_safety_mixed.cpp | 139 +++++++++++ cpp_thread_test/dgemv_thread_safety.cpp | 13 +- driver/level3/CMakeLists.txt | 2 +- driver/level3/Makefile | 8 +- driver/level3/level3_gemm3m_thread.c | 25 +- driver/level3/level3_syrk_threaded.c | 71 +----- driver/level3/level3_thread.c | 71 +----- driver/level3/level3_thread_lock.c | 124 +++++++++ 17 files changed, 604 insertions(+), 192 deletions(-) create mode 100644 cpp_thread_test/dgemm_thread_safety_mixed.cpp create mode 100644 driver/level3/level3_thread_lock.c diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 5c39a7d407..523f2bdb42 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -334,6 +334,242 @@ jobs: echo "::endgroup::" + linux_thread_stress: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + name: ${{ matrix.check-name }} + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + include: + - backend: pthread + check-name: "linux_thread_stress (pthread)" + - backend: openmp + check-name: "linux_thread_stress (openmp)" + - backend: tsan + check-name: linux_thread_sanitizer + + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Install Dependencies + run: | + cat << EOF | sudo tee -a /etc/apt/apt.conf.d/01norecommend + APT::Install-Recommends "0"; + APT::Install-Suggests "0"; + EOF + sudo apt-get update + sudo apt-get install -y ccache cmake ninja-build + if [ "${{ matrix.backend }}" = "tsan" ]; then + sudo apt-get install -y clang llvm + fi + + - name: Compilation cache + uses: actions/cache@v5 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-thread-${{ matrix.backend }}-${{ github.ref }}-${{ github.sha }} + restore-keys: | + ccache-${{ runner.os }}-thread-${{ matrix.backend }}-${{ github.ref }} + ccache-${{ runner.os }}-thread-${{ matrix.backend }} + + - name: Configure ccache + # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota. + run: | + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 250M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: Configure OpenBLAS + run: | + mkdir build && cd build + build_type=Release + c_compiler=gcc + cxx_compiler=g++ + dynamic_arch=ON + use_openmp=OFF + cpp_thread_safety_use_openmp=ON + dgemm_args="512;12;4" + dgemm_mixed_args="524288;16;20" + dgemv_args="512;12;4" + sanitizer_flags= + if [ "${{ matrix.backend }}" = "openmp" ]; then + use_openmp=ON + elif [ "${{ matrix.backend }}" = "tsan" ]; then + build_type=RelWithDebInfo + c_compiler=clang + cxx_compiler=clang++ + dynamic_arch=OFF + cpp_thread_safety_use_openmp=OFF + dgemm_args="64;4;1" + dgemm_mixed_args="131072;8;10" + dgemv_args="64;4;1" + sanitizer_flags="-fsanitize=thread -g -O1 -fno-omit-frame-pointer" + fi + cmake_args=( + -G Ninja + "-DCMAKE_BUILD_TYPE=$build_type" + "-DCMAKE_C_COMPILER=$c_compiler" + "-DCMAKE_CXX_COMPILER=$cxx_compiler" + -DBUILD_SHARED_LIBS=ON + -DBUILD_STATIC_LIBS=OFF + -DBUILD_WITHOUT_LAPACK=ON + "-DDYNAMIC_ARCH=$dynamic_arch" + -DNOFORTRAN=ON + -DUSE_THREAD=ON + "-DUSE_OPENMP=$use_openmp" + -DNUM_THREADS=32 + -DNUM_PARALLEL=2 + -DTARGET=CORE2 + -DCPP_THREAD_SAFETY_TEST=ON + "-DCPP_THREAD_SAFETY_USE_OPENMP=$cpp_thread_safety_use_openmp" + "-DCPP_THREAD_SAFETY_DGEMM_ARGS=$dgemm_args" + "-DCPP_THREAD_SAFETY_DGEMM_MIXED_ARGS=$dgemm_mixed_args" + "-DCPP_THREAD_SAFETY_DGEMV_ARGS=$dgemv_args" + -DCMAKE_C_COMPILER_LAUNCHER=ccache + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache + ) + if [ "${{ matrix.backend }}" = "tsan" ]; then + cmake_args+=( + "-DCMAKE_C_FLAGS=$sanitizer_flags" + "-DCMAKE_CXX_FLAGS=$sanitizer_flags" + -DCMAKE_SHARED_LINKER_FLAGS=-fsanitize=thread + -DCMAKE_EXE_LINKER_FLAGS=-fsanitize=thread + ) + fi + cmake "${cmake_args[@]}" .. + + - name: Build OpenBLAS + run: cd build && cmake --build . + + - name: Show ccache status + continue-on-error: true + run: ccache -s + + - name: Run thread stress tests + timeout-minutes: 30 + run: | + cd build + if [ "${{ matrix.backend }}" = "tsan" ]; then + export OPENBLAS_NUM_THREADS=2 + export LLVM_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer + export TSAN_OPTIONS=halt_on_error=1:exitcode=66:second_deadlock_stack=1 + else + export OPENBLAS_NUM_THREADS=8 + export OMP_NUM_THREADS=16 + fi + ctest -R 'dgemm_thread_safety|dgemm_thread_safety_mixed|dgemv_thread_safety' --output-on-failure + + + msys2_thread_stress: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + runs-on: windows-latest + + defaults: + run: + shell: msys2 {0} + + env: + CHERE_INVOKING: 1 + + steps: + - name: Get CPU name + shell: pwsh + run : | + Get-CIMInstance -Class Win32_Processor | Select-Object -Property Name + + - name: Install build dependencies + uses: msys2/setup-msys2@v2 + with: + msystem: UCRT64 + update: true + release: false # Use pre-installed version + install: >- + base-devel + mingw-w64-ucrt-x86_64-cc + mingw-w64-ucrt-x86_64-cmake + mingw-w64-ucrt-x86_64-ninja + mingw-w64-ucrt-x86_64-ccache + + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Prepare ccache + # Get cache location of ccache + # Create key that is used in action/cache/restore and action/cache/save steps + id: ccache-prepare + run: | + echo "ccachedir=$(cygpath -m $(ccache -k cache_dir))" >> $GITHUB_OUTPUT + # We include the commit sha in the cache key, as new cache entries are + # only created if there is no existing entry for the key yet. + echo "key=ccache-msys2-thread-stress-${{ github.ref }}-${{ github.sha }}" >> $GITHUB_OUTPUT + + - name: Restore ccache + uses: actions/cache/restore@v5 + with: + path: ${{ steps.ccache-prepare.outputs.ccachedir }} + key: ${{ steps.ccache-prepare.outputs.key }} + # Restore a matching ccache cache entry. Prefer same branch. + restore-keys: | + ccache-msys2-thread-stress-${{ github.ref }} + ccache-msys2-thread-stress + + - name: Configure ccache + # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota. + run: | + which ccache + test -d ${{ steps.ccache-prepare.outputs.ccachedir }} || mkdir -p ${{ steps.ccache-prepare.outputs.ccachedir }} + echo "max_size = 250M" > ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf + echo "compression = true" >> ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf + ccache -p + ccache -s + + - name: Configure OpenBLAS + run: | + mkdir build && cd build + cmake -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=ON \ + -DBUILD_STATIC_LIBS=OFF \ + -DBUILD_WITHOUT_LAPACK=ON \ + -DDYNAMIC_ARCH=ON \ + -DNOFORTRAN=ON \ + -DUSE_THREAD=ON \ + -DUSE_OPENMP=OFF \ + -DNUM_THREADS=32 \ + -DTARGET=CORE2 \ + -DCPP_THREAD_SAFETY_TEST=ON \ + -DCPP_THREAD_SAFETY_DGEMM_ARGS="384;8;4" \ + -DCPP_THREAD_SAFETY_DGEMM_MIXED_ARGS="524288;16;20" \ + -DCPP_THREAD_SAFETY_DGEMV_ARGS="384;8;4" \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + .. + + - name: Build OpenBLAS + run: cd build && cmake --build . + + - name: Show ccache status + continue-on-error: true + run: ccache -s + + - name: Save ccache + # Save the cache after we are done (successfully) building + uses: actions/cache/save@v5 + with: + path: ${{ steps.ccache-prepare.outputs.ccachedir }} + key: ${{ steps.ccache-prepare.outputs.key }} + + - name: Run thread stress tests + timeout-minutes: 30 + run: | + cd build + OPENBLAS_NUM_THREADS=8 OMP_NUM_THREADS=16 ctest -R 'dgemm_thread_safety|dgemm_thread_safety_mixed|dgemv_thread_safety' --output-on-failure + + cross_build: if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: ubuntu-22.04 diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a145ac6be..c88d9dd3e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,9 +58,10 @@ else() set(NO_AFFINITY 1) endif() -option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) +option(CPP_THREAD_SAFETY_TEST "Run massively parallel DGEMM tests to confirm thread safety of the library (requires about 1.3GB of RAM)" OFF) +option(CPP_THREAD_SAFETY_USE_OPENMP "Use OpenMP to launch the C++ thread safety tests" ON) -option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) +option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library" OFF) option(BUILD_STATIC_LIBS "Build static library" OFF) option(BUILD_SHARED_LIBS "Build shared library" OFF) if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS) @@ -820,4 +821,3 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake install(EXPORT "${PN}${SUFFIX64}Targets" NAMESPACE "${PN}${SUFFIX64}::" DESTINATION ${CMAKECONFIG_INSTALL_DIR}) - diff --git a/Makefile.install b/Makefile.install index e6720ccf2d..05471b2c83 100644 --- a/Makefile.install +++ b/Makefile.install @@ -355,7 +355,7 @@ endif endif ifeq ($(CPP_THREAD_SAFETY_TEST), 1) @install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 cpp_thread_test/dgemm_mixed_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) @install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) endif endif - diff --git a/Makefile.rule b/Makefile.rule index 92079e4dac..2e4d3ce367 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -290,10 +290,10 @@ COMMON_PROF = -pg # This is mostly intended as a developer feature to spot regressions, but users and # package maintainers can enable this if they have doubts about the thread safety of # the library, given the configuration in this file. -# By default, the thread safety tester launches 52 concurrent calculations at the same -# time. +# By default, the thread safety testers launch many concurrent calculations at +# the same time. # -# Please note that the test uses ~1300 MiB of RAM for the DGEMM test. +# Please note that the tests use ~1300 MiB of RAM for the DGEMM test. # # The test requires CBLAS to be built, a C++11 capable compiler and the presence of # an OpenMP implementation. If you are cross-compiling this test will probably not diff --git a/common_thread.h b/common_thread.h index 4a8db682bf..633d5516d1 100644 --- a/common_thread.h +++ b/common_thread.h @@ -191,6 +191,9 @@ int exec_blas(BLASLONG num_cpu, blas_param_t *param, void *buffer); #ifndef ASSEMBLER +void blas_level3_thread_enter(void); +void blas_level3_thread_leave(void); + int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, diff --git a/cpp_thread_test/CMakeLists.txt b/cpp_thread_test/CMakeLists.txt index 5eccb12ceb..5271d4594c 100644 --- a/cpp_thread_test/CMakeLists.txt +++ b/cpp_thread_test/CMakeLists.txt @@ -5,19 +5,35 @@ enable_language(CXX) set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") -if (USE_OPENMP) -if (CPP_THREAD_SAFETY_TEST) - message(STATUS building thread safety test) - add_executable(dgemm_thread_safety dgemm_thread_safety.cpp) - target_link_libraries(dgemm_thread_safety ${OpenBLAS_LIBNAME}) - add_test( dgemm_thread_safety ${CMAKE_CURRENT_BINARY_DIR}/dgemm_thread_safety) +set(CPP_THREAD_SAFETY_LIBS ${OpenBLAS_LIBNAME}) +find_package(Threads REQUIRED) +list(APPEND CPP_THREAD_SAFETY_LIBS Threads::Threads) +add_definitions(-DOPENBLAS_USE_GENERATED_CBLAS_H) + +if ((CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) AND CPP_THREAD_SAFETY_USE_OPENMP) + find_package(OpenMP REQUIRED COMPONENTS CXX) + list(APPEND CPP_THREAD_SAFETY_LIBS OpenMP::OpenMP_CXX) + add_definitions(-DCPP_THREAD_SAFETY_USE_OPENMP) endif() +set(CPP_THREAD_SAFETY_DGEMM_ARGS "" CACHE STRING "Arguments passed to the DGEMM thread safety test") +set(CPP_THREAD_SAFETY_DGEMM_MIXED_ARGS "" CACHE STRING "Arguments passed to the mixed DGEMM thread safety test") +set(CPP_THREAD_SAFETY_DGEMV_ARGS "" CACHE STRING "Arguments passed to the DGEMV thread safety test") -if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) - add_executable(dgemv_thread_safety dgemv_thread_safety.cpp) - target_link_libraries(dgemv_thread_safety ${OpenBLAS_LIBNAME}) - add_test(dgemv_thread_safety ${CMAKE_CURRENT_BINARY_DIR}/dgemv_thread_safety) +if (CPP_THREAD_SAFETY_TEST) + message(STATUS "building thread safety test") + add_executable(dgemm_thread_safety dgemm_thread_safety.cpp) + target_link_libraries(dgemm_thread_safety ${CPP_THREAD_SAFETY_LIBS}) + add_test(NAME dgemm_thread_safety COMMAND ${CMAKE_CURRENT_BINARY_DIR}/dgemm_thread_safety ${CPP_THREAD_SAFETY_DGEMM_ARGS}) + + add_executable(dgemm_thread_safety_mixed dgemm_thread_safety_mixed.cpp) + target_link_libraries(dgemm_thread_safety_mixed ${CPP_THREAD_SAFETY_LIBS}) + add_test(NAME dgemm_thread_safety_mixed COMMAND ${CMAKE_CURRENT_BINARY_DIR}/dgemm_thread_safety_mixed ${CPP_THREAD_SAFETY_DGEMM_MIXED_ARGS}) endif() + +if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) + add_executable(dgemv_thread_safety dgemv_thread_safety.cpp) + target_link_libraries(dgemv_thread_safety ${CPP_THREAD_SAFETY_LIBS}) + add_test(NAME dgemv_thread_safety COMMAND ${CMAKE_CURRENT_BINARY_DIR}/dgemv_thread_safety ${CPP_THREAD_SAFETY_DGEMV_ARGS}) endif() diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile index be8313e658..fe7a286251 100644 --- a/cpp_thread_test/Makefile +++ b/cpp_thread_test/Makefile @@ -1,15 +1,19 @@ TOPDIR = .. include $(TOPDIR)/Makefile.system -all :: dgemv_tester dgemm_tester +all :: dgemv_tester dgemm_tester dgemm_mixed_tester dgemv_tester : - $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester + $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester ./dgemv_tester dgemm_tester : dgemv_tester - $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester + $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester ./dgemm_tester +dgemm_mixed_tester : dgemm_tester + $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -std=c++11 dgemm_thread_safety_mixed.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_mixed_tester + ./dgemm_mixed_tester + clean :: - rm -f dgemv_tester dgemm_tester + rm -f dgemv_tester dgemm_tester dgemm_mixed_tester diff --git a/cpp_thread_test/cpp_thread_safety_common.h b/cpp_thread_test/cpp_thread_safety_common.h index 8005369a8a..aaa68d9046 100644 --- a/cpp_thread_test/cpp_thread_safety_common.h +++ b/cpp_thread_test/cpp_thread_safety_common.h @@ -1,3 +1,9 @@ +#ifdef CPP_THREAD_SAFETY_USE_OPENMP +#include +#else +#include +#endif + inline void pauser(){ /// a portable way to pause a program std::string dummy; @@ -13,6 +19,29 @@ void FailIfThreadsAreZero(uint32_t numConcurrentThreads) { } } +uint32_t GetMaxHwThreads() { +#ifdef CPP_THREAD_SAFETY_USE_OPENMP + return omp_get_max_threads(); +#else + const uint32_t maxHwThreads = std::thread::hardware_concurrency(); + return maxHwThreads == 0 ? 1 : maxHwThreads; +#endif +} + +void SetLauncherThreads(uint32_t numConcurrentThreads) { +#ifdef CPP_THREAD_SAFETY_USE_OPENMP + omp_set_num_threads(numConcurrentThreads); +#endif +} + +const char *LauncherName() { +#ifdef CPP_THREAD_SAFETY_USE_OPENMP + return " using OpenMP"; +#else + return ""; +#endif +} + void FillMatrices(std::vector>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){ for(uint32_t i=0; i(randomMatSize*randomMatSize); j++){ diff --git a/cpp_thread_test/dgemm_thread_safety.cpp b/cpp_thread_test/dgemm_thread_safety.cpp index 104c64f2ac..2a9f4c37a6 100644 --- a/cpp_thread_test/dgemm_thread_safety.cpp +++ b/cpp_thread_test/dgemm_thread_safety.cpp @@ -2,8 +2,11 @@ #include #include #include -#include +#ifdef OPENBLAS_USE_GENERATED_CBLAS_H +#include "generated/cblas.h" +#else #include "../cblas.h" +#endif #include "cpp_thread_safety_common.h" void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){ @@ -14,7 +17,7 @@ int main(int argc, char* argv[]){ blasint randomMatSize = 1024; //dimension of the random square matrices used uint32_t numConcurrentThreads = 96; //number of concurrent calls of the functions being tested uint32_t numTestRounds = 16; //number of testing rounds before success exit - uint32_t maxHwThreads = omp_get_max_threads(); + uint32_t maxHwThreads = GetMaxHwThreads(); if (maxHwThreads < 96) numConcurrentThreads = maxHwThreads; @@ -65,11 +68,13 @@ int main(int argc, char* argv[]){ //PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3); std::cout<<"done\n"; std::cout<<"Testing CBLAS DGEMM thread safety\n"; - omp_set_num_threads(numConcurrentThreads); + SetLauncherThreads(numConcurrentThreads); for(uint32_t R=0; R +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef OPENBLAS_USE_GENERATED_CBLAS_H +#include "generated/cblas.h" +#else +#include "../cblas.h" +#endif +#include "cpp_thread_safety_common.h" + +void compute_dgemm_pair(std::vector& transA, std::vector& noTransA, std::vector& B, double* firstOutput, double* secondOutput, const blasint randomMatSize, const bool sameVariant){ + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, randomMatSize, 2, 2, 1.0, &transA[0], randomMatSize, &B[0], 2, 0.0, firstOutput, 2); + if (sameVariant) + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, randomMatSize, 2, 4, 1.0, &transA[0], randomMatSize, &B[0], 2, 0.0, secondOutput, 2); + else + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, randomMatSize, 2, 4, 1.0, &noTransA[0], 4, &B[0], 2, 0.0, secondOutput, 2); +} + +void run_worker(std::vector& transA, std::vector& noTransA, std::vector& B, const std::vector& referenceFirst, const std::vector& referenceSecond, const blasint randomMatSize, const uint32_t numTestRounds, const bool sameVariant, std::atomic& readyThreads, std::atomic& startThreads, uint32_t& mismatches){ + std::vector firstOutput(static_cast(randomMatSize) * 2); + std::vector secondOutput(static_cast(randomMatSize) * 2); + const size_t outputBytes = static_cast(randomMatSize) * 2 * sizeof(double); + uint32_t localMismatches = 0; + + readyThreads.fetch_add(1); + while (!startThreads.load()) + std::this_thread::yield(); + + for(uint32_t i=0; i [sameVariant]"< cliArgs; + for (int i = 1; i < argc; i++){ + cliArgs.push_back(argv[i]); + std::cout<(randomMatSize) * 4; + const size_t outputElements = static_cast(randomMatSize) * 2; + std::vector transA(matrixElements); + std::vector noTransA(matrixElements); + std::vector B(8); + std::vector referenceFirst(outputElements); + std::vector referenceSecond(outputElements); + std::vector threads(numConcurrentThreads); + std::vector mismatchBlock(numConcurrentThreads); + std::atomic readyThreads(0); + std::atomic startThreads(false); + + std::cout<<"*----------------------------------*\n"; + std::cout<<"| Mixed DGEMM thread safety tester |\n"; + std::cout<<"*----------------------------------*\n"; + std::cout<<"Tall-skinny DGEMM M dimension: "<(matrixElements) * 2 * 8 + static_cast(outputElements) * (2 + 2 * numConcurrentThreads) * 8)/static_cast(1024*1024)<<" MiB of RAM\n"<(i % 512); + noTransA[i] = static_cast((i * 7) % 512); + } + std::cout<<"done\n"; + std::cout<<"Filling RHS matrix..."<(i + 1); + } + std::cout<<"done\n"; + + std::cout<<"Computing reference results..."< #include #include -#include +#ifdef OPENBLAS_USE_GENERATED_CBLAS_H +#include "generated/cblas.h" +#else #include "../cblas.h" +#endif #include "cpp_thread_safety_common.h" void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize) @@ -17,7 +20,7 @@ int main(int argc, char* argv[]) blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested uint32_t numTestRounds = 16; //number of testing rounds before success exit - uint32_t maxHwThreads = omp_get_max_threads(); + uint32_t maxHwThreads = GetMaxHwThreads(); if (maxHwThreads < 52) numConcurrentThreads = maxHwThreads; @@ -84,12 +87,14 @@ int main(int argc, char* argv[]) std::cout<<"done\n"; std::cout<<"Testing CBLAS DGEMV thread safety"< m; newarg.n = args -> n; @@ -994,13 +979,7 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); free(job); #endif -#ifndef USE_OPENMP -#ifndef OS_WINDOWS - pthread_mutex_unlock(&level3_lock); -#else - LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock); -#endif -#endif + blas_level3_thread_leave(); return 0; } diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index 47f303c1d2..88edc7e2a6 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -506,33 +506,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ -#ifdef USE_OPENMP - static omp_lock_t level3_lock, critical_section_lock; - static volatile BLASULONG init_lock = 0, omp_lock_initialized = 0, - parallel_section_left = MAX_PARALLEL_NUMBER; - - // Lock initialization; Todo : Maybe this part can be moved to blas_init() in blas_server_omp.c - while(omp_lock_initialized == 0) - { - blas_lock(&init_lock); - { - if(omp_lock_initialized == 0) - { - omp_init_lock(&level3_lock); - omp_init_lock(&critical_section_lock); - omp_lock_initialized = 1; - WMB; - } - blas_unlock(&init_lock); - } - } -#elif defined(OS_WINDOWS) - CRITICAL_SECTION level3_lock; - InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); -#else - static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; -#endif - blas_arg_t newarg; #ifndef USE_ALLOC_HEAP @@ -589,29 +562,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #endif #endif -#ifdef USE_OPENMP - omp_set_lock(&level3_lock); - omp_set_lock(&critical_section_lock); - - parallel_section_left--; - - /* - How OpenMP locks works with NUM_PARALLEL - 1) parallel_section_left = Number of available concurrent executions of OpenBLAS - Number of currently executing OpenBLAS executions - 2) level3_lock is acting like a master lock or barrier which stops OpenBLAS calls when all the parallel_section are currently busy executing other OpenBLAS calls - 3) critical_section_lock is used for updating variables shared between threads executing OpenBLAS calls concurrently and for unlocking of master lock whenever required - 4) Unlock master lock only when we have not already exhausted all the parallel_sections and allow another thread with a OpenBLAS call to enter - */ - if(parallel_section_left != 0) - omp_unset_lock(&level3_lock); - - omp_unset_lock(&critical_section_lock); - -#elif defined(OS_WINDOWS) - EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); -#else - pthread_mutex_lock(&level3_lock); -#endif + blas_level3_thread_enter(); newarg.m = args -> m; newarg.n = args -> n; @@ -759,25 +710,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO free(job); #endif -#ifdef USE_OPENMP - omp_set_lock(&critical_section_lock); - parallel_section_left++; - - /* - Unlock master lock only when all the parallel_sections are already exhausted and one of the thread has completed its OpenBLAS call - otherwise just increment the parallel_section_left - The master lock is only locked when we have exhausted all the parallel_sections, So only unlock it then and otherwise just increment the count - */ - if(parallel_section_left == 1) - omp_unset_lock(&level3_lock); - - omp_unset_lock(&critical_section_lock); - -#elif defined(OS_WINDOWS) - LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock); -#else - pthread_mutex_unlock(&level3_lock); -#endif + blas_level3_thread_leave(); return 0; } diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 2657bbcfbe..973de07ae3 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -569,33 +569,6 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG nthreads_m, BLASLONG nthreads_n) { -#ifdef USE_OPENMP - static omp_lock_t level3_lock, critical_section_lock; - static volatile BLASULONG init_lock = 0, omp_lock_initialized = 0, - parallel_section_left = MAX_PARALLEL_NUMBER; - - // Lock initialization; Todo : Maybe this part can be moved to blas_init() in blas_server_omp.c - while(omp_lock_initialized == 0) - { - blas_lock(&init_lock); - { - if(omp_lock_initialized == 0) - { - omp_init_lock(&level3_lock); - omp_init_lock(&critical_section_lock); - omp_lock_initialized = 1; - WMB; - } - blas_unlock(&init_lock); - } - } -#elif defined(OS_WINDOWS) - CRITICAL_SECTION level3_lock; - InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); -#else - static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; -#endif - blas_arg_t newarg; #ifndef USE_ALLOC_HEAP @@ -641,29 +614,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG #endif #endif -#ifdef USE_OPENMP - omp_set_lock(&level3_lock); - omp_set_lock(&critical_section_lock); - - parallel_section_left--; - - /* - How OpenMP locks works with NUM_PARALLEL - 1) parallel_section_left = Number of available concurrent executions of OpenBLAS - Number of currently executing OpenBLAS executions - 2) level3_lock is acting like a master lock or barrier which stops OpenBLAS calls when all the parallel_section are currently busy executing other OpenBLAS calls - 3) critical_section_lock is used for updating variables shared between threads executing OpenBLAS calls concurrently and for unlocking of master lock whenever required - 4) Unlock master lock only when we have not already exhausted all the parallel_sections and allow another thread with a OpenBLAS call to enter - */ - if(parallel_section_left != 0) - omp_unset_lock(&level3_lock); - - omp_unset_lock(&critical_section_lock); - -#elif defined(OS_WINDOWS) - EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); -#else - pthread_mutex_lock(&level3_lock); -#endif + blas_level3_thread_enter(); #ifdef USE_ALLOC_HEAP /* Dynamically allocate workspace */ @@ -797,25 +748,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG free(job); #endif -#ifdef USE_OPENMP - omp_set_lock(&critical_section_lock); - parallel_section_left++; - - /* - Unlock master lock only when all the parallel_sections are already exhausted and one of the thread has completed its OpenBLAS call - otherwise just increment the parallel_section_left - The master lock is only locked when we have exhausted all the parallel_sections, So only unlock it then and otherwise just increment the count - */ - if(parallel_section_left == 1) - omp_unset_lock(&level3_lock); - - omp_unset_lock(&critical_section_lock); - -#elif defined(OS_WINDOWS) - LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock); -#else - pthread_mutex_unlock(&level3_lock); -#endif + blas_level3_thread_leave(); return 0; } diff --git a/driver/level3/level3_thread_lock.c b/driver/level3/level3_thread_lock.c new file mode 100644 index 0000000000..1153f3f3ee --- /dev/null +++ b/driver/level3/level3_thread_lock.c @@ -0,0 +1,124 @@ +/*********************************************************************/ +/* Copyright 2026 The OpenBLAS Project. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE OPENBLAS PROJECT ``AS IS'' */ +/* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT */ +/* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND */ +/* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT */ +/* SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY */ +/* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR */ +/* CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF */ +/* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH */ +/* DAMAGE. */ +/* */ +/*********************************************************************/ + +#include "common.h" + +#ifdef USE_OPENMP + +static omp_lock_t level3_lock, critical_section_lock; +static volatile BLASULONG init_lock = 0; +static volatile BLASULONG omp_lock_initialized = 0; +static volatile BLASULONG parallel_section_left = MAX_PARALLEL_NUMBER; + +static void blas_level3_thread_lock_init(void) +{ + while (omp_lock_initialized == 0) { + blas_lock(&init_lock); + if (omp_lock_initialized == 0) { + omp_init_lock(&level3_lock); + omp_init_lock(&critical_section_lock); + WMB; + omp_lock_initialized = 1; + } + blas_unlock(&init_lock); + } +} + +void blas_level3_thread_enter(void) +{ + blas_level3_thread_lock_init(); + + omp_set_lock(&level3_lock); + omp_set_lock(&critical_section_lock); + + parallel_section_left--; + + if (parallel_section_left != 0) + omp_unset_lock(&level3_lock); + + omp_unset_lock(&critical_section_lock); +} + +void blas_level3_thread_leave(void) +{ + omp_set_lock(&critical_section_lock); + + parallel_section_left++; + + if (parallel_section_left == 1) + omp_unset_lock(&level3_lock); + + omp_unset_lock(&critical_section_lock); +} + +#elif defined(OS_WINDOWS) + +static CRITICAL_SECTION level3_lock; +static volatile BLASULONG init_lock = 0; +static volatile BLASULONG level3_lock_initialized = 0; + +static void blas_level3_thread_lock_init(void) +{ + while (level3_lock_initialized == 0) { + blas_lock(&init_lock); + if (level3_lock_initialized == 0) { + InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); + WMB; + level3_lock_initialized = 1; + } + blas_unlock(&init_lock); + } +} + +void blas_level3_thread_enter(void) +{ + blas_level3_thread_lock_init(); + EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); +} + +void blas_level3_thread_leave(void) +{ + LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock); +} + +#else + +static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; + +void blas_level3_thread_enter(void) +{ + pthread_mutex_lock(&level3_lock); +} + +void blas_level3_thread_leave(void) +{ + pthread_mutex_unlock(&level3_lock); +} + +#endif From 7c7c65e6923c1455548be22957c663c6014e0b7f Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Sun, 14 Jun 2026 17:09:44 -0600 Subject: [PATCH 2/2] fix windows build slowness and test errors --- .github/workflows/dynamic_arch.yml | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 523f2bdb42..6f45556e81 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -417,6 +417,10 @@ jobs: -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=OFF -DBUILD_WITHOUT_LAPACK=ON + -DBUILD_SINGLE=OFF + -DBUILD_DOUBLE=ON + -DBUILD_COMPLEX=OFF + -DBUILD_COMPLEX16=OFF "-DDYNAMIC_ARCH=$dynamic_arch" -DNOFORTRAN=ON -DUSE_THREAD=ON @@ -443,7 +447,9 @@ jobs: cmake "${cmake_args[@]}" .. - name: Build OpenBLAS - run: cd build && cmake --build . + run: | + cd build + cmake --build . --target dgemm_thread_safety dgemm_thread_safety_mixed dgemv_thread_safety - name: Show ccache status continue-on-error: true @@ -535,7 +541,11 @@ jobs: -DBUILD_SHARED_LIBS=ON \ -DBUILD_STATIC_LIBS=OFF \ -DBUILD_WITHOUT_LAPACK=ON \ - -DDYNAMIC_ARCH=ON \ + -DBUILD_SINGLE=OFF \ + -DBUILD_DOUBLE=ON \ + -DBUILD_COMPLEX=OFF \ + -DBUILD_COMPLEX16=OFF \ + -DDYNAMIC_ARCH=OFF \ -DNOFORTRAN=ON \ -DUSE_THREAD=ON \ -DUSE_OPENMP=OFF \ @@ -550,7 +560,9 @@ jobs: .. - name: Build OpenBLAS - run: cd build && cmake --build . + run: | + cd build + cmake --build . --target dgemm_thread_safety dgemm_thread_safety_mixed dgemv_thread_safety - name: Show ccache status continue-on-error: true @@ -567,6 +579,7 @@ jobs: timeout-minutes: 30 run: | cd build + export PATH="$PWD/lib:$PATH" OPENBLAS_NUM_THREADS=8 OMP_NUM_THREADS=16 ctest -R 'dgemm_thread_safety|dgemm_thread_safety_mixed|dgemv_thread_safety' --output-on-failure