From 1820fe27d0eca1df601e0bbedbbc89f36e5536c0 Mon Sep 17 00:00:00 2001 From: Minh Vu Date: Sun, 28 Jun 2026 23:19:48 +0200 Subject: [PATCH 1/2] Fix halo workspace docs and sample link Signed-off-by: Minh Vu --- benchmark/README.md | 4 ++-- docs/api/c_api.rst | 4 ++-- docs/api/f_api.rst | 24 ++++++++++++------------ include/cudecomp.h | 4 ++-- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/benchmark/README.md b/benchmark/README.md index d45bfe5..7750011 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -29,12 +29,12 @@ and `--gz` options define the number of GPUs to run on (should be equivalent to and Z dimensions of the domain to perform the FFT. `--csvfile` is the name of the file for the benchmark runner script to record results. The final positional option is the configuration name (from `benchmark_config.yaml`) to use for the run. -To visualize the benchmark results, a [`plot_heatmaps.py`](heatmap_scripts/plot_heatmaps.py) script to plot heatmaps from the data captured in the csv files. Running the script on the a csv file like the following: +To visualize the benchmark results, use the [`plot_heatmaps.py`](heatmap_scripts/plot_heatmaps.py) script to plot heatmaps from the data captured in the CSV files. Running the script on a CSV file like the following: ``` python plot_heatmaps.py --csvfile benchmark_c2c.dgxa100.8gpu.n1024.csv --output_prefix benchmark_c2c.dgxa100.8gpu.n1024 ``` will generate one or more image files `benchmark_c2c.dgxa100.8gpu.n1024_*.png` that contain heatmap plots, with each file corresponding to a distinct set of options (e.g. precision, axis-contiguous settings, in-place or out-of-place, etc.), which are listed in the plot title. -Several sample csv files and generated heatmap plots for 2048^3 C2C FFTs on a DGX A100 (80GB) system using NVHPC SDK 22.5, can be found in the [samples](heatmap_scripts/samples) directory. 
+Several sample CSV files and generated heatmap plots for 2048^3 C2C FFTs on a DGX A100 (80GB) system using NVHPC SDK 22.5 can be found in the [sample](heatmap_scripts/sample) directory. We can examine one of the sample plots ([`benchmark_c2c.dgxa100.8gpu.n2048_1.png`](heatmap_scripts/sample/benchmark_c2c.dgxa100.8gpu.n2048_1.png)), shown below, to explain the content. ![heatmap_example](heatmap_scripts/sample/benchmark_c2c.dgxa100.8gpu.n2048_1.png?raw=true) diff --git a/docs/api/c_api.rst b/docs/api/c_api.rst index 7145af8..5f5df68 100644 --- a/docs/api/c_api.rst +++ b/docs/api/c_api.rst @@ -63,7 +63,7 @@ Communication Backends .. _cudecompTransposeCommBackend_t-ref: -cudecompTranposeCommBackend_t -_____________________________ +cudecompTransposeCommBackend_t +______________________________ .. doxygenenum :: cudecompTransposeCommBackend_t @@ -233,7 +233,7 @@ _____________________ .. _cudecompTransposeCommBackendToString-ref: -cudecompTranposeCommBackendToString -___________________________________ +cudecompTransposeCommBackendToString +____________________________________ .. doxygenfunction:: cudecompTransposeCommBackendToString diff --git a/docs/api/f_api.rst b/docs/api/f_api.rst index eef2c98..6395c7d 100644 --- a/docs/api/f_api.rst +++ b/docs/api/f_api.rst @@ -113,9 +113,9 @@ Communication Backends .. _cudecompTransposeCommBackend_t-f-ref: -cudecompTranposeCommBackend +cudecompTransposeCommBackend _____________________________ -See documention for equivalent C enumerator, :ref:`cudecompTransposeCommBackend_t-ref`. +See documentation for equivalent C enumerator, :ref:`cudecompTransposeCommBackend_t-ref`. ------ @@ -123,7 +123,7 @@ See documention for equivalent C enumerator, :ref:`cudecompTransposeCommBackend_ cudecompHaloCommBackend _________________________ -See documention for equivalent C enumerator, :ref:`cudecompHaloCommBackend_t-ref`. +See documentation for equivalent C enumerator, :ref:`cudecompHaloCommBackend_t-ref`. 
------ @@ -134,7 +134,7 @@ Additional Enumerators cudecompDataType __________________ -See documention for equivalent C enumerator, :ref:`cudecompDataType_t-ref`. +See documentation for equivalent C enumerator, :ref:`cudecompDataType_t-ref`. ------ @@ -142,7 +142,7 @@ See documention for equivalent C enumerator, :ref:`cudecompDataType_t-ref`. cudecompAutotuneGridMode __________________________ -See documention for equivalent C enumerator, :ref:`cudecompAutotuneGridMode_t-ref`. +See documentation for equivalent C enumerator, :ref:`cudecompAutotuneGridMode_t-ref`. ------ @@ -150,7 +150,7 @@ See documention for equivalent C enumerator, :ref:`cudecompAutotuneGridMode_t-re cudecompRankOrder __________________ -See documention for equivalent C enumerator, :ref:`cudecompRankOrder_t-ref`. +See documentation for equivalent C enumerator, :ref:`cudecompRankOrder_t-ref`. ------ @@ -158,7 +158,7 @@ See documention for equivalent C enumerator, :ref:`cudecompRankOrder_t-ref`. cudecompResult ________________ -See documention for equivalent C enumerator, :ref:`cudecompResult_t-ref`. +See documentation for equivalent C enumerator, :ref:`cudecompResult_t-ref`. Functions ========================== @@ -273,7 +273,7 @@ _________________________________ Queries the required transpose workspace size, in elements, for a provided grid descriptor. - This function queries the required workspace size, in elements, for transposition communication using a provided grid descriptor. This workspace is required to faciliate local transposition/packing/unpacking operations, or for use as a staging buffer. + This function queries the required workspace size, in elements, for transposition communication using a provided grid descriptor. This workspace is required to facilitate local transposition/packing/unpacking operations, or for use as a staging buffer. :p cudecompHandle handle [in]: The initialized cuDecomp library handle :p cudecompGridDesc grid_desc [in]: A cuDecomp grid descriptor. 
@@ -288,9 +288,9 @@ cudecompGetHaloWorkspaceSize ____________________________ .. f:function:: cudecompGetHaloWorkspaceSize(handle, grid_desc, axis, halo_extents, workspace_size) - Queries the required transpose workspace size, in elements, for a provided grid descriptor. + Queries the required halo workspace size, in elements, for a provided grid descriptor. - This function queries the required workspace size, in elements, for transposition communication using a provided grid descriptor. This workspace is required to faciliate local transposition/packing/unpacking operations, or for use as a staging buffer. + This function queries the required workspace size, in elements, for halo communication using a provided grid descriptor. This workspace is required to facilitate local packing operations for halo regions that are not contiguous in memory, or for use as a staging buffer. :p cudecompHandle handle [in]: The initialized cuDecomp library handle :p cudecompGridDesc grid_desc [in]: A cuDecomp grid descriptor. @@ -378,14 +378,14 @@ _____________________ .. _cudecompTransposeCommBackendToString-f-ref: -cudecompTranposeCommBackendToString -___________________________________ +cudecompTransposeCommBackendToString +____________________________________ .. f:function:: cudecompTransposeCommBackendToString(comm_backend) Function to get string name of transpose communication backend. - :p cudecompTransposeCommBackend comm_backend [in]: A cuDecompTranposeCommBackend value. + :p cudecompTransposeCommBackend comm_backend [in]: A cudecompTransposeCommBackend value. :r character(:) res: A string representation of the transpose communication backend. Will return string “ERROR” if invalid backend value is provided. 
------ diff --git a/include/cudecomp.h b/include/cudecomp.h index 0a92966..a9b6003 100644 --- a/include/cudecomp.h +++ b/include/cudecomp.h @@ -322,7 +322,7 @@ cudecompResult_t cudecompGetPencilInfo(cudecompHandle_t handle, cudecompGridDesc /** * @brief Queries the required transpose workspace size, in elements, for a provided grid descriptor. * @details This function queries the required workspace size, in elements, for transposition communication using - * a provided grid descriptor. This workspace is required to faciliate local transposition/packing/unpacking operations, + * a provided grid descriptor. This workspace is required to facilitate local transposition/packing/unpacking operations, * or for use as a staging buffer. * @param[in] handle The initialized cuDecomp library handle * @param[in] grid_desc A cuDecomp grid descriptor @@ -336,7 +336,7 @@ cudecompResult_t cudecompGetTransposeWorkspaceSize(cudecompHandle_t handle, cude /** * @brief Queries the required halo workspace size, in elements, for a provided grid descriptor. * @details This function queries the required workspace size, in elements, for halo communication using - * a provided grid descriptor. This workspace is required to faciliate local packing operations for halo regions that + * a provided grid descriptor. This workspace is required to facilitate local packing operations for halo regions that * are not contiguous in memory, or for use as a staging buffer. 
* @param[in] handle The initialized cuDecomp library handle * @param[in] grid_desc A cuDecomp grid descriptor From b2c95dd29618096a07fd182118d3313ebb98e0cc Mon Sep 17 00:00:00 2001 From: Minh Vu Date: Tue, 30 Jun 2026 01:23:59 +0200 Subject: [PATCH 2/2] Fix clang-format line wrap Signed-off-by: Minh Vu --- include/cudecomp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cudecomp.h b/include/cudecomp.h index a9b6003..009ae95 100644 --- a/include/cudecomp.h +++ b/include/cudecomp.h @@ -322,8 +322,8 @@ cudecompResult_t cudecompGetPencilInfo(cudecompHandle_t handle, cudecompGridDesc /** * @brief Queries the required transpose workspace size, in elements, for a provided grid descriptor. * @details This function queries the required workspace size, in elements, for transposition communication using - * a provided grid descriptor. This workspace is required to facilitate local transposition/packing/unpacking operations, - * or for use as a staging buffer. + * a provided grid descriptor. This workspace is required to facilitate local transposition/packing/unpacking + * operations, or for use as a staging buffer. * @param[in] handle The initialized cuDecomp library handle * @param[in] grid_desc A cuDecomp grid descriptor * @param[out] workspace_size A pointer to a 64-bit integer to write the workspace size