From dc932cc56bdefce5f81dfa31fecf08fedcd0c4c6 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 25 Feb 2025 13:57:25 -0800 Subject: [PATCH 01/58] Support BOUT_FOR_RAJA GPU field operators --- CMakeLists.txt | 9 +- include/bout/coordinates_accessor.hxx | 2 +- include/bout/field_accessor.hxx | 40 +++++- include/bout/mesh.hxx | 5 + include/bout/rajalib.hxx | 13 +- include/bout/single_index_ops.hxx | 11 -- src/field/gen_fieldops.jinja | 200 +++++++++++++++++++++----- src/field/gen_fieldops.py | 58 +++++++- src/mesh/coordinates.cxx | 5 +- src/mesh/coordinates_accessor.cxx | 18 ++- 10 files changed, 294 insertions(+), 67 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c45fca3b72..f0a657fe94 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -386,8 +386,15 @@ if (BOUT_GENERATE_FIELDOPS) if (NOT ClangFormat_FOUND) message(FATAL_ERROR "clang-format not found, but you have requested to generate code!") endif() + if (BOUT_ENABLE_RAJA) + set(GEN_LOOP_EXEC "raja") + elseif (BOUT_ENABLE_OPENMP) + set(GEN_LOOP_EXEC "openmp") + else() + set(GEN_LOOP_EXEC "serial") + endif() add_custom_command( OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/src/field/generated_fieldops.cxx - COMMAND ${Python3_EXECUTABLE} gen_fieldops.py --filename generated_fieldops.cxx.tmp + COMMAND ${Python3_EXECUTABLE} gen_fieldops.py --loop-exec ${GEN_LOOP_EXEC} --filename generated_fieldops.cxx.tmp COMMAND ${ClangFormat_BIN} generated_fieldops.cxx.tmp -i COMMAND ${CMAKE_COMMAND} -E rename generated_fieldops.cxx.tmp generated_fieldops.cxx DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/field/gen_fieldops.jinja ${CMAKE_CURRENT_SOURCE_DIR}/src/field/gen_fieldops.py diff --git a/include/bout/coordinates_accessor.hxx b/include/bout/coordinates_accessor.hxx index 532351d57a..2376ab5039 100644 --- a/include/bout/coordinates_accessor.hxx +++ b/include/bout/coordinates_accessor.hxx @@ -31,7 +31,7 @@ /// -> If Coordinates data is changed, the cache should be cleared /// by calling CoordinatesAccessor::clear() 
struct CoordinatesAccessor { - CoordinatesAccessor() = delete; + CoordinatesAccessor() {} /// Constructor from Coordinates /// Copies data from coords, doesn't modify it diff --git a/include/bout/field_accessor.hxx b/include/bout/field_accessor.hxx index 69b58da979..71d0537d9e 100644 --- a/include/bout/field_accessor.hxx +++ b/include/bout/field_accessor.hxx @@ -57,10 +57,17 @@ struct FieldAccessor { /// Constructor from Field3D /// /// @param[in] f The field to access. Must already be allocated - explicit FieldAccessor(FieldType& f) : coords(f.getCoordinates()) { + explicit FieldAccessor(FieldType& f) { ASSERT0(f.getLocation() == location); ASSERT0(f.isAllocated()); + if (auto* Coords = f.getCoordinates()) { + coords = CoordinatesAccessor{Coords}; + } + else { + coords = CoordinatesAccessor{}; + } + data = BoutRealArray{&f(0, 0, 0)}; // Field size @@ -81,15 +88,19 @@ struct FieldAccessor { ddt = BoutRealArray{&(f.timeDeriv()->operator()(0, 0, 0))}; } + explicit FieldAccessor(const FieldType& f) : FieldAccessor(const_cast(f)) {} + /// Provide shorthand for access to field data. /// Does not convert between 3D and 2D indices, /// so fa[i] is equivalent to fa.data[i]. 
/// BOUT_HOST_DEVICE inline const BoutReal& operator[](int ind) const { return data[ind]; } + BOUT_HOST_DEVICE inline BoutReal& operator[](int ind) { return data[ind]; } BOUT_HOST_DEVICE inline const BoutReal& operator[](const Ind3D& ind) const { return data[ind.ind]; } + BOUT_HOST_DEVICE inline BoutReal& operator[](const Ind3D& ind) { return data[ind.ind]; } // Pointers to the field data arrays // These are wrapped in BoutRealArray types so they can be indexed with Ind3D or int @@ -115,6 +126,9 @@ struct FieldAccessor { template using Field2DAccessor = FieldAccessor; +template +using Field3DAccessor = FieldAccessor; + /// Syntactic sugar for time derivative of a field /// /// Usage: @@ -130,4 +144,28 @@ BOUT_HOST_DEVICE inline BoutRealArray& ddt(const FieldAccessor(fa.ddt); } +struct FieldPerpAccessor { + FieldPerpAccessor() = delete; + + int nx, nz; + int yindex; + BoutReal* data; + + explicit FieldPerpAccessor(const FieldPerp& f) { + ASSERT0(f.isAllocated()); + + data = BoutRealArray{const_cast(&f(0, 0, 0))}; + + // Field size + nx = f.getNx(); + nz = f.getNz(); + + yindex = f.getIndex(); + } + + BOUT_HOST_DEVICE int getIndex() const { return yindex; } + BOUT_HOST_DEVICE inline const BoutReal& operator[](int ind) const { return data[ind]; } + BOUT_HOST_DEVICE inline BoutReal& operator[](int ind) { return data[ind]; } +}; + #endif diff --git a/include/bout/mesh.hxx b/include/bout/mesh.hxx index a1c88a2634..b6553d06ec 100644 --- a/include/bout/mesh.hxx +++ b/include/bout/mesh.hxx @@ -762,6 +762,11 @@ public: return {(indPerp.ind - jz) * LocalNy + LocalNz * jy + jz, LocalNy, LocalNz}; } + BOUT_HOST_DEVICE int flatIndPerpto3D(const int& flatIndPerp, const int nz, int jy = 0) const { + int jz = flatIndPerp % nz; + return (flatIndPerp - jz) * LocalNy + LocalNz * jy + jz; + } + /// Converts an Ind3D to an Ind2D representing a 2D index using a lookup -- to be used with care Ind2D map3Dto2D(const Ind3D& ind3D) { return {indexLookup3Dto2D[ind3D.ind], LocalNy, 1}; diff 
--git a/include/bout/rajalib.hxx b/include/bout/rajalib.hxx index b9f6913459..92eae68858 100644 --- a/include/bout/rajalib.hxx +++ b/include/bout/rajalib.hxx @@ -23,6 +23,15 @@ #include "RAJA/RAJA.hpp" // using RAJA lib +#if BOUT_HAS_CUDA +// TODO: Make configurable +const int CUDA_BLOCK_SIZE = 256; +using EXEC_POL = RAJA::cuda_exec; +//using EXEC_POL = RAJA::loop_exec; +#else // not BOUT_USE_CUDA +using EXEC_POL = RAJA::loop_exec; +#endif // end BOUT_USE_CUDA + /// Wrapper around RAJA::forall /// Enables computations to be done on CPU or GPU (CUDA). /// @@ -81,7 +90,7 @@ struct RajaForAll { // Note: must be a local variable const int* _ob_i_ind_raw = &_ob_i_ind[0]; RAJA::forall(RAJA::RangeSegment(0, _ob_i_ind.size()), - [=] RAJA_DEVICE(int id) { + [=] RAJA_DEVICE(int id) mutable { // Look up index and call user function f(_ob_i_ind_raw[id]); }); @@ -127,7 +136,7 @@ private: /// to create variables which shadow the class members. /// #define BOUT_FOR_RAJA(index, region, ...) \ - RajaForAll(region) << [ =, ##__VA_ARGS__ ] RAJA_DEVICE(int index) +RajaForAll(region) << [ =, ##__VA_ARGS__ ] RAJA_DEVICE(int index) mutable #else // BOUT_HAS_RAJA diff --git a/include/bout/single_index_ops.hxx b/include/bout/single_index_ops.hxx index 60bd78bc36..c29d1a471f 100644 --- a/include/bout/single_index_ops.hxx +++ b/include/bout/single_index_ops.hxx @@ -7,17 +7,6 @@ #include "field_accessor.hxx" -#if BOUT_HAS_RAJA -//-- RAJA CUDA settings--------------------------------------------------------start -#if BOUT_HAS_CUDA -const int CUDA_BLOCK_SIZE = 256; // TODO: Make configurable -using EXEC_POL = RAJA::cuda_exec; -#else // not BOUT_USE_CUDA -using EXEC_POL = RAJA::loop_exec; -#endif // end BOUT_USE_CUDA -////-----------CUDA settings------------------------------------------------------end -#endif // end BOUT_HAS_RAJA - // Ind3D: i.zp(): BOUT_HOST_DEVICE inline int i_zp(const int id, const int nz) { int jz = id % nz; diff --git a/src/field/gen_fieldops.jinja 
b/src/field/gen_fieldops.jinja index ecd4e628cc..60f9cbbd7e 100644 --- a/src/field/gen_fieldops.jinja +++ b/src/field/gen_fieldops.jinja @@ -8,6 +8,26 @@ checkData({{lhs.name}}); checkData({{rhs.name}}); + {% if (region_loop == "BOUT_FOR_RAJA") %} + {% if out.field_type == "FieldPerp" %} + auto {{out.name}}_acc = FieldPerpAccessor{ {{out.name}} }; + {% else %} + auto {{out.name}}_acc = FieldAccessor({{out.name}}); + {% endif %} + {% if lhs.field_type == "FieldPerp" %} + auto {{lhs.name}}_acc = FieldPerpAccessor{ {{lhs.name}} }; + {% elif lhs.field_type == "BoutReal" %} + {% else %} + auto {{lhs.name}}_acc = FieldAccessor({{lhs.name}}); + {% endif %} + {% if rhs.field_type == "FieldPerp" %} + auto {{rhs.name}}_acc = FieldPerpAccessor{ {{rhs.name}} }; + {% elif rhs.field_type == "BoutReal" %} + {% else %} + auto {{rhs.name}}_acc = FieldAccessor({{rhs.name}}); + {% endif %} + {% endif %} + {% if out == "Field3D" %} {% if lhs == rhs == "Field3D" %} {{out.name}}.setRegion({{lhs.name}}.getMesh()->getCommonRegion({{lhs.name}}.getRegionID(), @@ -20,45 +40,98 @@ {% endif %} {% if (out == "Field3D") and ((lhs == "Field2D") or (rhs =="Field2D")) %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + int mesh_nz = {{lhs.name if lhs.field_type != "BoutReal" else rhs.name}}_acc.mesh_nz; + {% else %} Mesh *localmesh = {{lhs.name if lhs.field_type != "BoutReal" else rhs.name}}.getMesh(); + {% endif %} {% if (lhs == "Field2D") %} {{region_loop}}({{index_var}}, {{lhs.name}}.getRegion({{region_name}})) { {% else %} {{region_loop}}({{index_var}}, {{rhs.name}}.getRegion({{region_name}})) { {% endif %} - const auto {{mixed_base_ind}} = localmesh->ind2Dto3D({{index_var}}); - {% if (operator == "/") and (rhs == "Field2D") %} - const auto tmp = 1.0 / {{rhs.mixed_index}}; - for (int {{jz_var}} = 0; {{jz_var}} < localmesh->LocalNz; ++{{jz_var}}){ - {{out.mixed_index}} = {{lhs.mixed_index}} * tmp; + {% if (region_loop == "BOUT_FOR_RAJA") %} + const auto {{mixed_base_ind}} = {{index_var}} * mesh_nz; 
+ {% else %} + const auto {{mixed_base_ind}} = localmesh->ind2Dto3D({{index_var}}); + {% endif %} + {% if (operator == "/") and (rhs == "Field2D") %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + const auto tmp = 1.0 / {{rhs.mixed_index_acc}}; {% else %} - for (int {{jz_var}} = 0; {{jz_var}} < localmesh->LocalNz; ++{{jz_var}}){ - {{out.mixed_index}} = {{lhs.mixed_index}} {{operator}} {{rhs.mixed_index}}; + const auto tmp = 1.0 / {{rhs.mixed_index}}; {% endif %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + for (int {{jz_var}} = 0; {{jz_var}} < mesh_nz; ++{{jz_var}}){ + {% else %} + for (int {{jz_var}} = 0; {{jz_var}} < localmesh->LocalNz; ++{{jz_var}}){ + {% endif %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.mixed_index_acc}} = {{lhs.mixed_index_acc}} * tmp; + {% else %} + {{out.mixed_index}} = {{lhs.mixed_index}} * tmp; + {% endif %} + {% else %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + for (int {{jz_var}} = 0; {{jz_var}} < mesh_nz; ++{{jz_var}}){ + {% else %} + for (int {{jz_var}} = 0; {{jz_var}} < localmesh->LocalNz; ++{{jz_var}}){ + {% endif %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.mixed_index_acc}} = {{lhs.mixed_index_acc}} {{operator}} {{rhs.mixed_index_acc}}; + {% else %} + {{out.mixed_index}} = {{lhs.mixed_index}} {{operator}} {{rhs.mixed_index}}; + {% endif %} + {% endif %} } - } + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% elif out == "FieldPerp" and (lhs == "Field2D" or lhs == "Field3D" or rhs == "Field2D" or rhs == "Field3D")%} Mesh *localmesh = {{lhs.name if lhs.field_type != "BoutReal" else rhs.name}}.getMesh(); {{region_loop}}({{index_var}}, {{out.name}}.getRegion({{region_name}})) { - int yind = {{lhs.name if lhs == "FieldPerp" else rhs.name}}.getIndex(); - const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); + {% if (region_loop == "BOUT_FOR_RAJA") %} + int yind = {{lhs.name if lhs == "FieldPerp" else rhs.name}}_acc.getIndex(); + {% else %} + int yind = {{lhs.name if lhs == 
"FieldPerp" else rhs.name}}.getIndex(); + {% endif %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + ; // DONE2 + const auto {{mixed_base_ind}} = localmesh->flatIndPerpto3D({{index_var}}, result_acc.nz, yind); + {% else %} + const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); + {% endif %} {% if lhs != "FieldPerp" %} - {{out.index}} = {{lhs.base_index}} {{operator}} {{rhs.index}}; + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.index_acc}} = {{lhs.base_index_acc}} {{operator}} {{rhs.index_acc}}; + {% else %} + {{out.index}} = {{lhs.base_index}} {{operator}} {{rhs.index}}; + {% endif %} {% else %} - {{out.index}} = {{lhs.index}} {{operator}} {{rhs.base_index}}; + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.index_acc}} = {{lhs.index_acc}} {{operator}} {{rhs.base_index_acc}}; + {% else %} + {{out.index}} = {{lhs.index}} {{operator}} {{rhs.base_index}}; + {% endif %} {% endif %} - } + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% elif (operator == "/") and (rhs == "BoutReal") %} const auto tmp = 1.0 / {{rhs.index}}; {{region_loop}}({{index_var}}, {{out.name}}.getValidRegionWithDefault({{region_name}})) { - {{out.index}} = {{lhs.index}} * tmp; - } + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.index_acc}} = {{lhs.index_acc}} * tmp; + {% else %} + {{out.index}} = {{lhs.index}} * tmp; + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% else %} {{region_loop}}({{index_var}}, {{out.name}}.getValidRegionWithDefault({{region_name}})) { - {{out.index}} = {{lhs.index}} {{operator}} {{rhs.index}}; - } + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.index_acc}} = {{lhs.index_acc}} {{operator}} {{rhs.index_acc}}; + {% else %} + {{out.index}} = {{lhs.index}} {{operator}} {{rhs.index}}; + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% endif %} checkData({{out.name}}); @@ -84,49 +157,102 @@ checkData(*this); checkData({{rhs.name}}); + {% if (region_loop == "BOUT_FOR_RAJA") %} + {% if 
lhs.field_type == "FieldPerp" %} + auto this_acc = FieldPerpAccessor{(*this)}; + {% else %} + auto this_acc = FieldAccessor(*this); + {% endif %} + {% if rhs.field_type == "FieldPerp" %} + auto {{rhs.name}}_acc = FieldPerpAccessor{ {{rhs.name}} }; + {% elif rhs.field_type == "BoutReal" %} + {% else %} + auto {{rhs.name}}_acc = FieldAccessor({{rhs.name}}); + {% endif %} + {% endif %} + {% if lhs == rhs == "Field3D" %} regionID = fieldmesh->getCommonRegion(regionID, {{rhs.name}}.regionID); {% endif %} - {% if (lhs == "Field3D") and (rhs =="Field2D") %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + int mesh_nz = fieldmesh->LocalNz; + {% endif %} {{region_loop}}({{index_var}}, {{rhs.name}}.getRegion({{region_name}})) { - const auto {{mixed_base_ind}} = fieldmesh->ind2Dto3D({{index_var}}); - {% if (operator == "/") and (rhs == "Field2D") %} - const auto tmp = 1.0 / {{rhs.mixed_index}}; - for (int {{jz_var}} = 0; {{jz_var}} < fieldmesh->LocalNz; ++{{jz_var}}){ - (*this)[{{mixed_base_ind}} + {{jz_var}}] *= tmp; + {% if (region_loop == "BOUT_FOR_RAJA") %} + const auto {{mixed_base_ind}} = {{index_var}} * mesh_nz; + {% else %} + const auto {{mixed_base_ind}} = fieldmesh->ind2Dto3D({{index_var}}); + {% endif %} + {% if (operator == "/") and (rhs == "Field2D") %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + const auto tmp = 1.0 / {{rhs.mixed_index_acc}}; + for (int {{jz_var}} = 0; {{jz_var}} < mesh_nz; ++{{jz_var}}){ + this_acc[{{mixed_base_ind}} + {{jz_var}}] *= tmp; {% else %} - for (int {{jz_var}} = 0; {{jz_var}} < fieldmesh->LocalNz; ++{{jz_var}}){ - (*this)[{{mixed_base_ind}} + {{jz_var}}] {{operator}}= {{rhs.index}}; + const auto tmp = 1.0 / {{rhs.mixed_index}}; + for (int {{jz_var}} = 0; {{jz_var}} < fieldmesh->LocalNz; ++{{jz_var}}){ + (*this)[{{mixed_base_ind}} + {{jz_var}}] *= tmp; {% endif %} + {% else %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + for (int {{jz_var}} = 0; {{jz_var}} < mesh_nz; ++{{jz_var}}){ + this_acc[{{mixed_base_ind}} + {{jz_var}}] 
{{operator}}= {{rhs.index_acc}}; + {% else %} + for (int {{jz_var}} = 0; {{jz_var}} < fieldmesh->LocalNz; ++{{jz_var}}){ + (*this)[{{mixed_base_ind}} + {{jz_var}}] {{operator}}= {{rhs.index}}; + {% endif %} + {% endif %} } - } + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% elif lhs == "FieldPerp" and (rhs == "Field3D" or rhs == "Field2D")%} Mesh *localmesh = this->getMesh(); + {% if (region_loop == "BOUT_FOR_RAJA") %} + int yind = this->getIndex(); + {% endif %} {{region_loop}}({{index_var}}, this->getRegion({{region_name}})) { - int yind = this->getIndex(); - const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); - (*this)[{{index_var}}] {{operator}}= {{rhs.base_index}}; - } + {% if (region_loop == "BOUT_FOR_RAJA") %} + const auto {{mixed_base_ind}} = localmesh->flatIndPerpto3D({{index_var}}, yind); + this_acc[{{index_var}}] {{operator}}= {{rhs.base_index_acc}}; + {% else %} + int yind = this->getIndex(); + const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); + (*this)[{{index_var}}] {{operator}}= {{rhs.base_index}}; + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% elif rhs == "FieldPerp" and (lhs == "Field3D" or lhs == "Field2D")%} Mesh *localmesh = this->getMesh(); {{region_loop}}({{index_var}}, {{rhs.name}}.getRegion({{region_name}})) { - int yind = {{rhs.name}}.getIndex(); - const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); - (*this)[{{base_ind_var}}] {{operator}}= {{rhs.index}}; - } + {% if (region_loop == "BOUT_FOR_RAJA") %} + int yind = {{rhs.name}}.getIndex(); + const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); + this_acc[{{base_ind_var}}] {{operator}}= {{rhs.index}}; + {% else %} + int yind = {{rhs.name}}.getIndex(); + const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); + (*this)[{{base_ind_var}}] {{operator}}= {{rhs.index}}; + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} 
{% elif (operator == "/") and (lhs == "Field3D" or lhs == "Field2D") and (rhs =="BoutReal") %} const auto tmp = 1.0 / {{rhs.index}}; {{region_loop}}({{index_var}}, this->getRegion({{region_name}})) { + {% if (region_loop == "BOUT_FOR_RAJA") %} + this_acc[{{index_var}}] *= tmp; + {% else %} (*this)[{{index_var}}] *= tmp; - } + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% else %} {{region_loop}}({{index_var}}, this->getRegion({{region_name}})) { - (*this)[{{index_var}}] {{operator}}= {{rhs.index}}; - } + {% if (region_loop == "BOUT_FOR_RAJA") %} + this_acc[{{index_var}}] {{operator}}= {{rhs.index_acc}}; + {% else %} + (*this)[{{index_var}}] {{operator}}= {{rhs.index}}; + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% endif %} checkData(*this); diff --git a/src/field/gen_fieldops.py b/src/field/gen_fieldops.py index 29631ff7aa..bf06a8ea5c 100755 --- a/src/field/gen_fieldops.py +++ b/src/field/gen_fieldops.py @@ -132,6 +132,17 @@ def index(self): else: return "{self.name}[{self.index_var}]".format(self=self) + @property + def index_acc(self): + """Returns "_acc[{index_var}]" for an accessor-based index, except if + field_type is BoutReal, in which case just returns "" + + """ + if self.field_type == "BoutReal": + return "{self.name}".format(self=self) + else: + return "{self.name}_acc[{self.index_var}]".format(self=self) + @property def mixed_index(self): """Returns "[{index_var} + {jz_var}]" if field_type is Field3D, @@ -147,6 +158,21 @@ def mixed_index(self): else: # Field2D return "{self.name}[{self.index_var}]".format(self=self) + @property + def mixed_index_acc(self): + """Returns "_acc[{index_var} + {jz_var}]" for an accessor if field_type + is Field3D, self.index if Field2D or just returns "" for BoutReal + + """ + if self.field_type == "BoutReal": + return "{self.name}_acc".format(self=self) + elif self.field_type == "Field3D": + return "{self.name}_acc[{self.mixed_base_ind_var} + {self.jz_var}]".format( + 
self=self + ) + else: # Field2D + return "{self.name}_acc[{self.index_var}]".format(self=self) + @property def base_index(self): """Returns "[{mixed_base_ind_var}]" if field_type is Field3D, Field2D or FieldPerp @@ -158,6 +184,18 @@ def base_index(self): else: return "{self.name}[{self.mixed_base_ind_var}]".format(self=self) + @property + def base_index_acc(self): + """Returns "_acc[{mixed_base_ind_var}]" for an accessor if field_type is + Field3D, Field2D or FieldPerp or just returns "" for BoutReal + + """ + if self.field_type == "BoutReal": + return "{self.name}".format(self=self) + else: + return "{self.name}_acc[{self.mixed_base_ind_var}]".format(self=self) + + def __eq__(self, other): try: return self.field_type == other.field_type @@ -198,11 +236,11 @@ def returnType(f1, f2): ) # By default use OpenMP enabled loops but allow to disable parser.add_argument( - "--no-openmp", - action="store_false", - default=False, - dest="noOpenMP", - help="Don't use OpenMP compatible loops", + "--loop-exec", + default="openmp", + dest="loop_exec", + choices=["serial", "openmp", "raja"], + help="Choose the loop execution method. 
Default is OpenMP", ) args = parser.parse_args() @@ -213,10 +251,16 @@ def returnType(f1, f2): mixed_base_ind_var = "base_ind" region_name = '"RGN_ALL"' - if args.noOpenMP: + if args.loop_exec == "openmp": + region_loop = "BOUT_FOR" + elif args.loop_exec == "raja": + region_loop = "BOUT_FOR_RAJA" + header += "#include \n" + header += "#include \n" + elif args.loop_exec == "serial": region_loop = "BOUT_FOR_SERIAL" else: - region_loop = "BOUT_FOR" + raise ValueError("Unknown loop execution method") # Declare what fields we currently support: # Field perp is currently missing diff --git a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx index 34c524d1e7..8123720144 100644 --- a/src/mesh/coordinates.cxx +++ b/src/mesh/coordinates.cxx @@ -23,6 +23,8 @@ #include "parallel/fci.hxx" #include "parallel/shiftedmetricinterp.hxx" +#include "bout/coordinates_accessor.hxx" + // use anonymous namespace so this utility function is not available outside this file namespace { template @@ -1203,10 +1205,11 @@ int Coordinates::geometry(bool recalculate_staggered, localmesh->recalculateStaggeredCoordinates(); } - // Invalidate and recalculate cached variables + // Invalidate and recalculate cached variables and any accessor zlength_cache.reset(); Grad2_par2_DDY_invSgCache.clear(); invSgCache.reset(); + CoordinatesAccessor::clear(this); return 0; } diff --git a/src/mesh/coordinates_accessor.cxx b/src/mesh/coordinates_accessor.cxx index aff546c2b0..196234d999 100644 --- a/src/mesh/coordinates_accessor.cxx +++ b/src/mesh/coordinates_accessor.cxx @@ -40,8 +40,9 @@ CoordinatesAccessor::CoordinatesAccessor(const Coordinates* coords) { // Copy data from Coordinates variable into data array // Uses the symbol to look up the corresponding Offset -#define COPY_STRIPE1(symbol) \ - data[stripe_size * ind.ind + static_cast(Offset::symbol)] = coords->symbol[ind]; +#define COPY_STRIPE1(symbol) \ + if (coords->symbol.isAllocated()) \ + data[stripe_size * ind.ind + static_cast(Offset::symbol)] = 
coords->symbol[ind]; // Implement copy for each argument #define COPY_STRIPE(...) \ @@ -54,10 +55,15 @@ CoordinatesAccessor::CoordinatesAccessor(const Coordinates* coords) { COPY_STRIPE(d1_dx, d1_dy, d1_dz); COPY_STRIPE(J); - data[stripe_size * ind.ind + static_cast(Offset::B)] = coords->Bxy[ind]; - data[stripe_size * ind.ind + static_cast(Offset::Byup)] = coords->Bxy.yup()[ind]; - data[stripe_size * ind.ind + static_cast(Offset::Bydown)] = - coords->Bxy.ydown()[ind]; + if (coords->Bxy.isAllocated()) { + data[stripe_size * ind.ind + static_cast(Offset::B)] = coords->Bxy[ind]; + if (coords->Bxy.yup().isAllocated()) + data[stripe_size * ind.ind + static_cast(Offset::Byup)] = + coords->Bxy.yup()[ind]; + if (coords->Bxy.ydown().isAllocated()) + data[stripe_size * ind.ind + static_cast(Offset::Bydown)] = + coords->Bxy.ydown()[ind]; + } COPY_STRIPE(G1, G3); COPY_STRIPE(g11, g12, g13, g22, g23, g33); From b83d2b5286ab5f56e4ca257bf86dc952f84daf57 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Mon, 26 May 2025 18:57:31 -0700 Subject: [PATCH 02/58] Working WIP --- include/bout/field3d.hxx | 5 +- include/bout/field_accessor.hxx | 1 + include/bout/fieldops.hxx | 67 +++++++++++++++ include/bout/rajalib.hxx | 15 ++++ include/bout/vector3d.hxx | 1 + src/field/field3d.cxx | 3 +- src/field/generated_fieldops.cxx | 86 +++++++++++++++++-- src/field/vecops.cxx | 1 + .../laplace/impls/naulin/naulin_laplace.cxx | 1 + src/sys/derivs.cxx | 1 + 10 files changed, 173 insertions(+), 8 deletions(-) create mode 100644 include/bout/fieldops.hxx diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index a75e38df36..6d0624ff73 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -38,6 +38,7 @@ class Field3D; #include class Mesh; +class BinaryExpr; /// Class for 3D X-Y-Z scalar fields /*! 
@@ -183,6 +184,7 @@ public: Field3D(Array data, Mesh* localmesh, CELL_LOC location = CELL_CENTRE, DirectionTypes directions_in = {YDirectionType::Standard, ZDirectionType::Standard}); + Field3D(const BinaryExpr& expr); /// Destructor ~Field3D() override; @@ -424,6 +426,7 @@ public: /// return void, as only part initialised void operator=(const FieldPerp& rhs); Field3D& operator=(BoutReal val); + Field3D& operator=(BinaryExpr expr); ///@} /// Addition operators @@ -518,7 +521,7 @@ FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator*(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); -Field3D operator+(const Field3D& lhs, const Field3D& rhs); +BinaryExpr operator+(const Field3D& lhs, const Field3D& rhs); Field3D operator-(const Field3D& lhs, const Field3D& rhs); Field3D operator*(const Field3D& lhs, const Field3D& rhs); Field3D operator/(const Field3D& lhs, const Field3D& rhs); diff --git a/include/bout/field_accessor.hxx b/include/bout/field_accessor.hxx index 71d0537d9e..a43420d6b3 100644 --- a/include/bout/field_accessor.hxx +++ b/include/bout/field_accessor.hxx @@ -96,6 +96,7 @@ struct FieldAccessor { /// BOUT_HOST_DEVICE inline const BoutReal& operator[](int ind) const { return data[ind]; } BOUT_HOST_DEVICE inline BoutReal& operator[](int ind) { return data[ind]; } + __device__ inline BoutReal operator()(int i) const { return data[i]; } BOUT_HOST_DEVICE inline const BoutReal& operator[](const Ind3D& ind) const { return data[ind.ind]; diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx new file mode 100644 index 0000000000..b80aaec446 --- /dev/null +++ b/include/bout/fieldops.hxx @@ -0,0 +1,67 @@ +#pragma once +#ifndef BOUT_FIELDOPS_HXX +#define BOUT_FIELDOPS_HXX + +#include "bout/bout_types.hxx" +#include "bout/field_accessor.hxx" + +struct Add { + __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a + b; } +}; +struct Sub { + __device__ 
inline BoutReal operator()(BoutReal a, BoutReal b) const { return a - b; } +}; +struct Mul { + __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a * b; } +}; +struct Div { + __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a / b; } +}; + +struct BinaryExpr { + struct RegionIndices { + int* data; + int size; + + RegionIndices(int n) : size(n) { + cudaMallocManaged(&data, n * sizeof(int)); + for (int i = 0; i < n; ++i) + data[i] = 0; + } + ~RegionIndices() { cudaFree(data); } + + __device__ inline int operator()(int idx) const { return data[idx]; } + }; + + using FieldType = FieldAccessor; + + FieldType lhs; + FieldType rhs; + RegionIndices indices; + Add op; + + Mesh* mesh; + CELL_LOC location = CELL_CENTRE; + DirectionTypes directions; + + template + BinaryExpr(FieldType lhs, FieldType rhs, Mesh* mesh, CELL_LOC location, + DirectionTypes directions, const Region& region) + : lhs(lhs), rhs(rhs), mesh(mesh), location(location), directions(directions), + indices(region.getIndices().size()) { + // Copy the region indices into the managed array + for (int i = 0; i < indices.size; ++i) { + indices.data[i] = region.getIndices()[i].ind; + } + } + + __host__ __device__ inline int getSize() const { return indices.size; } + __device__ inline int regionIdx(int idx) const { return indices(idx); } + __device__ inline BoutReal operator()(int idx) const { return op(lhs(idx), rhs(idx)); } + + Mesh* getMesh() const { return mesh; } + CELL_LOC getLocation() const { return location; } + DirectionTypes getDirections() const { return directions; } +}; + +#endif // BOUT_EXPRESSION_HXX \ No newline at end of file diff --git a/include/bout/rajalib.hxx b/include/bout/rajalib.hxx index 92eae68858..b3da46da50 100644 --- a/include/bout/rajalib.hxx +++ b/include/bout/rajalib.hxx @@ -14,6 +14,7 @@ */ #pragma once +#include "bout/array.hxx" #ifndef RAJALIB_H #define RAJALIB_H @@ -138,6 +139,20 @@ private: #define BOUT_FOR_RAJA(index, region, 
...) \ RajaForAll(region) << [ =, ##__VA_ARGS__ ] RAJA_DEVICE(int index) mutable +// NEW STUFF + +template +__global__ void evaluator(BoutReal *out, Expr &expr) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + for (int i = tid; i < expr.getSize(); i += stride) { + out[expr.regionIdx(i)] = expr(expr.regionIdx(i)); // single‐pass fusion + } +} + +// END OF NEW STUFF + + #else // BOUT_HAS_RAJA #warning RAJA not enabled. BOUT_FOR_RAJA falling back to BOUT_FOR. diff --git a/include/bout/vector3d.hxx b/include/bout/vector3d.hxx index 0c71dcffa5..ad68dc17ee 100644 --- a/include/bout/vector3d.hxx +++ b/include/bout/vector3d.hxx @@ -36,6 +36,7 @@ class Vector3D; class Field2D; class Vector2D; #include "bout/field3d.hxx" +#include "bout/fieldops.hxx" /*! * Represents a 3D vector, with x,y,z components diff --git a/src/field/field3d.cxx b/src/field/field3d.cxx index 0d2bc0694e..9ea488d8f1 100644 --- a/src/field/field3d.cxx +++ b/src/field/field3d.cxx @@ -805,7 +805,8 @@ bool operator==(const Field3D& a, const Field3D& b) { if (!a.isAllocated() || !b.isAllocated()) { return false; } - return min(abs(a - b)) < 1e-10; + Field3D Sub = a - b; + return min(Sub) < 1e-10; } std::ostream& operator<<(std::ostream& out, const Field3D& value) { diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index 6b778acee3..b1b99caaa9 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -1,11 +1,73 @@ // This file is autogenerated - see gen_fieldops.py +#include "bout/rajalib.hxx" +#include "bout/fieldops.hxx" + #include #include +#include #include #include #include #include +template +struct ExprFor { + using type = T; +}; + +template <> +struct ExprFor { + using type = FieldAccessor; +}; + +template +using ExprFor_t = typename ExprFor>::type; + +//template +//class ExpressionExpr : public ExprBase { +//private: +// const Expression& lhs; +// const Expression& rhs; +// Op op; +// +//public: 
+// ExpressionExpr(const Expression& lhs, const Expression& rhs, Op op) +// : lhs(lhs), rhs(rhs), op(op), +// ExprBase(lhs.getMesh(), lhs.getLocation(), lhs.getDirections()) {} +// +// __device__ BoutReal operator()(int idx) const override { +// return op(lhs(idx), rhs(idx)); +// } +// +// __host__ __device__ int getSize() const override { +// return lhs.getSize(); // Assume same size +// } +// +// __device__ int regionIdx(int idx) const override { +// return lhs.regionIdx(idx); // Use lhs indexing +// } +//}; + +Field3D& Field3D::operator=(BinaryExpr expr) { + constexpr int THREADS = 256; + int blocks = (size() + THREADS - 1) / THREADS; + + // one kernel launch that writes each element exactly once + evaluator<<>>(&data[0], expr); + cudaDeviceSynchronize(); + return *this; +} +// +Field3D::Field3D(const BinaryExpr& expr) { + Array data{expr.getSize()}; + + constexpr int THREADS = 256; + int blocks = (expr.getSize() + THREADS - 1) / THREADS; + evaluator<<>>(&data[0], expr); + cudaDeviceSynchronize(); + *this = Field3D{data, expr.getMesh(), expr.getLocation(), expr.getDirections()}; +} + // Provide the C++ wrapper for multiplication of Field3D and Field3D Field3D operator*(const Field3D& lhs, const Field3D& rhs) { ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -21,6 +83,8 @@ Field3D operator*(const Field3D& lhs, const Field3D& rhs) { } checkData(result); + std::cout << "operator*\n"; + getchar(); return result; } @@ -65,6 +129,8 @@ Field3D operator/(const Field3D& lhs, const Field3D& rhs) { } checkData(result); + std::cout << "operator/\n"; + getchar(); return result; } @@ -95,7 +161,7 @@ Field3D& Field3D::operator/=(const Field3D& rhs) { } // Provide the C++ wrapper for addition of Field3D and Field3D -Field3D operator+(const Field3D& lhs, const Field3D& rhs) { +BinaryExpr operator+(const Field3D& lhs, const Field3D& rhs) { ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(lhs)}; @@ -104,12 +170,18 @@ Field3D operator+(const Field3D& lhs, const Field3D& rhs) { 
result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] + rhs[index]; - } + std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; + return BinaryExpr{static_cast>(lhs), + static_cast>(rhs), + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + result.getValidRegionWithDefault("RGN_ALL")}; - checkData(result); - return result; + //constexpr int THREADS = 256; + //int blocks = (BE.getSize() + THREADS - 1) / THREADS; + //evaluator<<>>(&result(0, 0, 0), BE); + //return result; } // Provide the C++ operator to update Field3D by addition with Field3D @@ -152,6 +224,8 @@ Field3D operator-(const Field3D& lhs, const Field3D& rhs) { result[index] = lhs[index] - rhs[index]; } + std::cout << "operator-\n"; + getchar(); checkData(result); return result; } diff --git a/src/field/vecops.cxx b/src/field/vecops.cxx index 5f34e2af02..9b1105e7aa 100644 --- a/src/field/vecops.cxx +++ b/src/field/vecops.cxx @@ -28,6 +28,7 @@ #include #include +#include #include #include #include diff --git a/src/invert/laplace/impls/naulin/naulin_laplace.cxx b/src/invert/laplace/impls/naulin/naulin_laplace.cxx index e6f68d850d..74ec68dae9 100644 --- a/src/invert/laplace/impls/naulin/naulin_laplace.cxx +++ b/src/invert/laplace/impls/naulin/naulin_laplace.cxx @@ -142,6 +142,7 @@ #include #include #include +#include #include #include #include diff --git a/src/sys/derivs.cxx b/src/sys/derivs.cxx index ee9bcbcc2c..55e2c77a29 100644 --- a/src/sys/derivs.cxx +++ b/src/sys/derivs.cxx @@ -49,6 +49,7 @@ #include +#include #include #include From 970870958f390587c1c1dff9ae3ee1493aa74c8a Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 27 May 2025 17:12:14 -0700 Subject: [PATCH 03/58] WIP 2 - Compiles but crashes. 
I suspect it's because nested operators copy Field3D on the device (rhs) --- include/bout/array.hxx | 2 +- include/bout/field3d.hxx | 52 +++++++++++++++++++-- include/bout/fieldops.hxx | 79 ++++++++++++++++++++++++++----- src/field/generated_fieldops.cxx | 80 +++++++++----------------------- 4 files changed, 140 insertions(+), 73 deletions(-) diff --git a/include/bout/array.hxx b/include/bout/array.hxx index 2c42f15aad..b83c29c51d 100644 --- a/include/bout/array.hxx +++ b/include/bout/array.hxx @@ -66,7 +66,7 @@ struct ArrayData { #if BOUT_HAS_UMPIRE auto& rm = umpire::ResourceManager::getInstance(); #if BOUT_HAS_CUDA - auto allocator = rm.getAllocator(umpire::resource::Pinned); + auto allocator = rm.getAllocator(umpire::resource::Unified); #else auto allocator = rm.getAllocator("HOST"); #endif diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 6d0624ff73..b03730ddfa 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -38,7 +38,10 @@ class Field3D; #include class Mesh; -class BinaryExpr; + +//template +//class BinaryExpr; +#include "bout/fieldops.hxx" /// Class for 3D X-Y-Z scalar fields /*! 
@@ -184,7 +187,15 @@ public: Field3D(Array data, Mesh* localmesh, CELL_LOC location = CELL_CENTRE, DirectionTypes directions_in = {YDirectionType::Standard, ZDirectionType::Standard}); - Field3D(const BinaryExpr& expr); + template + Field3D(const BinaryExpr& expr) { + Array data{expr.getSize()}; + constexpr int THREADS = 256; + int blocks = (expr.getSize() + THREADS - 1) / THREADS; + evaluatorExpr<<>>(&data[0], expr); + cudaDeviceSynchronize(); + *this = Field3D{data, expr.getMesh(), expr.getLocation(), expr.getDirections()}; + } /// Destructor ~Field3D() override; @@ -415,6 +426,15 @@ public: return &data[(jx * ny + jy) * nz]; } + struct View { + BoutReal* data; + __device__ inline BoutReal operator()(int idx) const { return data[idx]; } + }; + operator View() { return View{&data[0]}; } + + __device__ inline BoutReal operator()(int i) { return View()(i); } + __device__ inline BoutReal operator()(int i) const { return View()(i); } + ///////////////////////////////////////////////////////// // Operators @@ -426,7 +446,15 @@ public: /// return void, as only part initialised void operator=(const FieldPerp& rhs); Field3D& operator=(BoutReal val); - Field3D& operator=(BinaryExpr expr); + template + Field3D& operator=(BinaryExpr expr) { + constexpr int THREADS = 256; + int blocks = (expr.getSize() + THREADS - 1) / THREADS; + evaluatorExpr<<>>(&data[0], expr); + cudaDeviceSynchronize(); + return *this; + } + ///@} /// Addition operators @@ -521,7 +549,23 @@ FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator*(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); -BinaryExpr operator+(const Field3D& lhs, const Field3D& rhs); +template && is_expr_v>> +BinaryExpr operator+(const L& lhs, const R& rhs) { + auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); + + std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; + return BinaryExpr{lhs, + rhs, + 
BinaryExpr::Op::ADD, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + (regionID.has_value() ? lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; +} + Field3D operator-(const Field3D& lhs, const Field3D& rhs); Field3D operator*(const Field3D& lhs, const Field3D& rhs); Field3D operator/(const Field3D& lhs, const Field3D& rhs); diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index b80aaec446..7cdb339854 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -3,7 +3,14 @@ #define BOUT_FIELDOPS_HXX #include "bout/bout_types.hxx" -#include "bout/field_accessor.hxx" + +#include +#include + +class Mesh; +class Field3D; + +#include struct Add { __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a + b; } @@ -18,7 +25,18 @@ struct Div { __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a / b; } }; +template +__global__ static void evaluatorExpr(BoutReal* out, Expr& expr) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + for (int i = tid; i < expr.getSize(); i += stride) { + out[expr.regionIdx(i)] = expr(expr.regionIdx(i)); // single‐pass fusion + } +} + +template struct BinaryExpr { + enum class Op { ADD, SUB, MUL, DIV }; struct RegionIndices { int* data; int size; @@ -33,22 +51,22 @@ struct BinaryExpr { __device__ inline int operator()(int idx) const { return data[idx]; } }; - using FieldType = FieldAccessor; - - FieldType lhs; - FieldType rhs; + L lhs; + R rhs; RegionIndices indices; - Add op; + Op op; Mesh* mesh; CELL_LOC location = CELL_CENTRE; DirectionTypes directions; + std::optional regionID; template - BinaryExpr(FieldType lhs, FieldType rhs, Mesh* mesh, CELL_LOC location, - DirectionTypes directions, const Region& region) - : lhs(lhs), rhs(rhs), mesh(mesh), location(location), directions(directions), - indices(region.getIndices().size()) { + BinaryExpr(L lhs, R rhs, Op op, 
Mesh* mesh, CELL_LOC location, + DirectionTypes directions, std::optional regionID, + const Region& region) + : lhs(lhs), rhs(rhs), op(op), mesh(mesh), location(location), + directions(directions), regionID(regionID), indices(region.getIndices().size()) { // Copy the region indices into the managed array for (int i = 0; i < indices.size; ++i) { indices.data[i] = region.getIndices()[i].ind; @@ -57,11 +75,50 @@ struct BinaryExpr { __host__ __device__ inline int getSize() const { return indices.size; } __device__ inline int regionIdx(int idx) const { return indices(idx); } - __device__ inline BoutReal operator()(int idx) const { return op(lhs(idx), rhs(idx)); } + __device__ inline BoutReal operator()(int idx) const { + switch (op) { + case Op::ADD: + return Add{}(lhs(idx), rhs(idx)); + case Op::SUB: + return Sub{}(lhs(idx), rhs(idx)); + case Op::MUL: + return Mul{}(lhs(idx), rhs(idx)); + case Op::DIV: + return Div{}(lhs(idx), rhs(idx)); + } + } + + void evaluate(BoutReal* data) const {} Mesh* getMesh() const { return mesh; } CELL_LOC getLocation() const { return location; } DirectionTypes getDirections() const { return directions; } + std::optional getRegionID() const { return regionID; }; }; +//template +//struct Expr { +// using type = T; +//}; +// +//template <> +//struct Expr { +// using type = Field3D::View; +//}; + +// 1) detect our BinaryExpr template +template +struct is_binary_expr : std::false_type {}; +template +struct is_binary_expr> : std::true_type {}; + +// 2) detect “any subclass of Field” +// assuming Field is your common base class +template +constexpr bool is_field_v = std::is_base_of>::value; + +// 3) combine into “is one of our expression types” +template +constexpr bool is_expr_v = is_field_v || is_binary_expr>::value; + #endif // BOUT_EXPRESSION_HXX \ No newline at end of file diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index b1b99caaa9..2c414c8dde 100644 --- a/src/field/generated_fieldops.cxx +++ 
b/src/field/generated_fieldops.cxx @@ -10,45 +10,9 @@ #include #include -template -struct ExprFor { - using type = T; -}; - -template <> -struct ExprFor { - using type = FieldAccessor; -}; - -template -using ExprFor_t = typename ExprFor>::type; - -//template -//class ExpressionExpr : public ExprBase { -//private: -// const Expression& lhs; -// const Expression& rhs; -// Op op; -// -//public: -// ExpressionExpr(const Expression& lhs, const Expression& rhs, Op op) -// : lhs(lhs), rhs(rhs), op(op), -// ExprBase(lhs.getMesh(), lhs.getLocation(), lhs.getDirections()) {} -// -// __device__ BoutReal operator()(int idx) const override { -// return op(lhs(idx), rhs(idx)); -// } -// -// __host__ __device__ int getSize() const override { -// return lhs.getSize(); // Assume same size -// } -// -// __device__ int regionIdx(int idx) const override { -// return lhs.regionIdx(idx); // Use lhs indexing -// } -//}; - -Field3D& Field3D::operator=(BinaryExpr expr) { +#if 0 +template +Field3D& Field3D::operator=(BinaryExpr expr) { constexpr int THREADS = 256; int blocks = (size() + THREADS - 1) / THREADS; @@ -57,8 +21,12 @@ Field3D& Field3D::operator=(BinaryExpr expr) { cudaDeviceSynchronize(); return *this; } -// -Field3D::Field3D(const BinaryExpr& expr) { +template Field3D& + Field3D::operator= (BinaryExpr expr); +#endif + +#if 0 +Field3D::Field3D(const BinaryExpr& expr) { Array data{expr.getSize()}; constexpr int THREADS = 256; @@ -67,6 +35,7 @@ Field3D::Field3D(const BinaryExpr& expr) { cudaDeviceSynchronize(); *this = Field3D{data, expr.getMesh(), expr.getLocation(), expr.getDirections()}; } +#endif // Provide the C++ wrapper for multiplication of Field3D and Field3D Field3D operator*(const Field3D& lhs, const Field3D& rhs) { @@ -83,8 +52,6 @@ Field3D operator*(const Field3D& lhs, const Field3D& rhs) { } checkData(result); - std::cout << "operator*\n"; - getchar(); return result; } @@ -129,8 +96,6 @@ Field3D operator/(const Field3D& lhs, const Field3D& rhs) { } checkData(result); 
- std::cout << "operator/\n"; - getchar(); return result; } @@ -160,29 +125,32 @@ Field3D& Field3D::operator/=(const Field3D& rhs) { return *this; } +#if 0 // Provide the C++ wrapper for addition of Field3D and Field3D -BinaryExpr operator+(const Field3D& lhs, const Field3D& rhs) { +template +BinaryExpr operator+(const L& lhs, const R& rhs) { ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(lhs)}; checkData(lhs); checkData(rhs); - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); + auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast>(lhs), - static_cast>(rhs), + return BinaryExpr{lhs, + rhs, + BinaryExpr::Op::ADD, lhs.getMesh(), lhs.getLocation(), lhs.getDirections(), - result.getValidRegionWithDefault("RGN_ALL")}; - - //constexpr int THREADS = 256; - //int blocks = (BE.getSize() + THREADS - 1) / THREADS; - //evaluator<<>>(&result(0, 0, 0), BE); - //return result; + regionID, + (regionID.has_value() ? 
lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; } +template BinaryExpr operator+ (const Field3D& lhs, + const Field3D& rhs); +#endif // Provide the C++ operator to update Field3D by addition with Field3D Field3D& Field3D::operator+=(const Field3D& rhs) { @@ -224,8 +192,6 @@ Field3D operator-(const Field3D& lhs, const Field3D& rhs) { result[index] = lhs[index] - rhs[index]; } - std::cout << "operator-\n"; - getchar(); checkData(result); return result; } From e20da9d606a4eaee7581ad3d5ae6ac54f6e7a382 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 27 May 2025 19:24:41 -0700 Subject: [PATCH 04/58] Working - WIP 3 - Uses Views to avoid copying uncopyable stuff --- include/bout/field3d.hxx | 17 +++++++++------- include/bout/fieldops.hxx | 41 ++++++++++++++++++++++++++------------- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index b03730ddfa..ca62885fdf 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -192,7 +192,8 @@ public: Array data{expr.getSize()}; constexpr int THREADS = 256; int blocks = (expr.getSize() + THREADS - 1) / THREADS; - evaluatorExpr<<>>(&data[0], expr); + evaluatorExpr<<>>( + &data[0], static_cast::View>(expr)); cudaDeviceSynchronize(); *this = Field3D{data, expr.getMesh(), expr.getLocation(), expr.getDirections()}; } @@ -427,10 +428,11 @@ public: } struct View { - BoutReal* data; + const BoutReal* data; __device__ inline BoutReal operator()(int idx) const { return data[idx]; } }; operator View() { return View{&data[0]}; } + operator View() const { return View{&data[0]}; } __device__ inline BoutReal operator()(int i) { return View()(i); } __device__ inline BoutReal operator()(int i) const { return View()(i); } @@ -450,7 +452,8 @@ public: Field3D& operator=(BinaryExpr expr) { constexpr int THREADS = 256; int blocks = (expr.getSize() + THREADS - 1) / THREADS; - evaluatorExpr<<>>(&data[0], expr); + 
evaluatorExpr<<>>( + &data[0], static_cast::View>(expr)); cudaDeviceSynchronize(); return *this; } @@ -551,13 +554,13 @@ FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); template && is_expr_v>> -BinaryExpr operator+(const L& lhs, const R& rhs) { +BinaryExpr operator+(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{lhs, - rhs, - BinaryExpr::Op::ADD, + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + BinaryExpr::Op::ADD, lhs.getMesh(), lhs.getLocation(), lhs.getDirections(), diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 7cdb339854..359177f3fc 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -26,7 +26,7 @@ struct Div { }; template -__global__ static void evaluatorExpr(BoutReal* out, Expr& expr) { +__global__ static void evaluatorExpr(BoutReal* out, const Expr& expr) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int stride = blockDim.x * gridDim.x; for (int i = tid; i < expr.getSize(); i += stride) { @@ -73,20 +73,33 @@ struct BinaryExpr { } } - __host__ __device__ inline int getSize() const { return indices.size; } - __device__ inline int regionIdx(int idx) const { return indices(idx); } - __device__ inline BoutReal operator()(int idx) const { - switch (op) { - case Op::ADD: - return Add{}(lhs(idx), rhs(idx)); - case Op::SUB: - return Sub{}(lhs(idx), rhs(idx)); - case Op::MUL: - return Mul{}(lhs(idx), rhs(idx)); - case Op::DIV: - return Div{}(lhs(idx), rhs(idx)); + __host__ inline int getSize() const { return indices.size; } + + struct View { + L lhs; + R rhs; + int* indices; + int size; + Op op; + + __host__ __device__ inline int getSize() const { return size; } + __device__ inline int regionIdx(int idx) const { return indices[idx]; } + __device__ inline BoutReal operator()(int idx) const { + switch (op) { + case Op::ADD: + return 
Add{}(lhs(idx), rhs(idx)); + case Op::SUB: + return Sub{}(lhs(idx), rhs(idx)); + case Op::MUL: + return Mul{}(lhs(idx), rhs(idx)); + case Op::DIV: + return Div{}(lhs(idx), rhs(idx)); + } } - } + }; + + operator View() { return View{lhs, rhs, indices.data, indices.size, op}; } + operator View() const { return View{lhs, rhs, indices.data, indices.size, op}; } void evaluate(BoutReal* data) const {} From 096f57637216f9133d69a06b6aa8a8b6bdd82f7b Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 27 May 2025 23:54:55 -0700 Subject: [PATCH 05/58] WIP 3 - operatorors +=, -=, *=, /= are working --- include/bout/field3d.hxx | 105 ++++++++++++--- include/bout/fieldops.hxx | 7 +- src/field/generated_fieldops.cxx | 214 +------------------------------ 3 files changed, 96 insertions(+), 230 deletions(-) diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index ca62885fdf..a299f5bfdc 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -190,11 +190,7 @@ public: template Field3D(const BinaryExpr& expr) { Array data{expr.getSize()}; - constexpr int THREADS = 256; - int blocks = (expr.getSize() + THREADS - 1) / THREADS; - evaluatorExpr<<>>( - &data[0], static_cast::View>(expr)); - cudaDeviceSynchronize(); + expr.evaluate(&data[0]); *this = Field3D{data, expr.getMesh(), expr.getLocation(), expr.getDirections()}; } /// Destructor @@ -450,11 +446,7 @@ public: Field3D& operator=(BoutReal val); template Field3D& operator=(BinaryExpr expr) { - constexpr int THREADS = 256; - int blocks = (expr.getSize() + THREADS - 1) / THREADS; - evaluatorExpr<<>>( - &data[0], static_cast::View>(expr)); - cudaDeviceSynchronize(); + expr.evaluate(&data[0]); return *this; } @@ -462,28 +454,56 @@ public: /// Addition operators ///@{ - Field3D& operator+=(const Field3D& rhs); + //Field3D& operator+=(const Field3D& rhs); + template >> + Field3D& operator+=(const R& rhs) { + printf("Running operator+= with CUDA\n"); + data.ensureUnique(); + (*this) = (*this) + rhs; + 
return *this; + } Field3D& operator+=(const Field2D& rhs); Field3D& operator+=(BoutReal rhs); ///@} /// Subtraction operators ///@{ - Field3D& operator-=(const Field3D& rhs); + //Field3D& operator-=(const Field3D& rhs); + template >> + Field3D& operator-=(const R& rhs) { + printf("Running operator-= with CUDA\n"); + data.ensureUnique(); + (*this) = (*this) - rhs; + return *this; + } Field3D& operator-=(const Field2D& rhs); Field3D& operator-=(BoutReal rhs); ///@} /// Multiplication operators ///@{ - Field3D& operator*=(const Field3D& rhs); + //Field3D& operator*=(const Field3D& rhs); + template >> + Field3D& operator*=(const R& rhs) { + printf("Running operator*= with CUDA\n"); + data.ensureUnique(); + (*this) = (*this) * rhs; + return *this; + } Field3D& operator*=(const Field2D& rhs); Field3D& operator*=(BoutReal rhs); ///@} /// Division operators ///@{ - Field3D& operator/=(const Field3D& rhs); + template >> + Field3D& operator/=(const R& rhs) { + printf("Running operator/= with CUDA\n"); + data.ensureUnique(); + (*this) = (*this) * rhs; + return *this; + } + //Field3D& operator/=(const Field3D& rhs); Field3D& operator/=(const Field2D& rhs); Field3D& operator/=(BoutReal rhs); ///@} @@ -546,6 +566,10 @@ private: // Non-member overloaded operators +template +constexpr bool always_false = false; + + // Binary operators FieldPerp operator+(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs); @@ -569,9 +593,56 @@ BinaryExpr operator+(const L& lhs, const R& : lhs.getMesh()->getRegion("RGN_ALL"))}; } -Field3D operator-(const Field3D& lhs, const Field3D& rhs); -Field3D operator*(const Field3D& lhs, const Field3D& rhs); -Field3D operator/(const Field3D& lhs, const Field3D& rhs); +template && is_expr_v>> +BinaryExpr operator-(const L& lhs, const R& rhs) { + auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); + + std::cout << "RUNNING operator- using BinaryExpr with CUDA" << "\n"; + 
return BinaryExpr{static_cast(lhs), + static_cast(rhs), + BinaryExpr::Op::SUB, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + (regionID.has_value() ? lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; +} + +template && is_expr_v>> +BinaryExpr operator*(const L& lhs, const R& rhs) { + auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); + + std::cout << "RUNNING operator* using BinaryExpr with CUDA" << "\n"; + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + BinaryExpr::Op::MUL, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + (regionID.has_value() ? lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; +} + +template && is_expr_v>> +BinaryExpr operator/(const L& lhs, const R& rhs) { + auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); + + std::cout << "RUNNING operator/ using BinaryExpr with CUDA" << "\n"; + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + BinaryExpr::Op::DIV, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + (regionID.has_value() ? 
lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; +} Field3D operator+(const Field3D& lhs, const Field2D& rhs); Field3D operator-(const Field3D& lhs, const Field2D& rhs); diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 359177f3fc..2ef7161f6d 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -101,7 +101,12 @@ struct BinaryExpr { operator View() { return View{lhs, rhs, indices.data, indices.size, op}; } operator View() const { return View{lhs, rhs, indices.data, indices.size, op}; } - void evaluate(BoutReal* data) const {} + void evaluate(BoutReal* data) const { + constexpr int THREADS = 256; + int blocks = (getSize() + THREADS - 1) / THREADS; + evaluatorExpr<<>>(&data[0], static_cast(*this)); + cudaDeviceSynchronize(); + } Mesh* getMesh() const { return mesh; } CELL_LOC getLocation() const { return location; } diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index 2c414c8dde..63cbf9b847 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -10,218 +10,6 @@ #include #include -#if 0 -template -Field3D& Field3D::operator=(BinaryExpr expr) { - constexpr int THREADS = 256; - int blocks = (size() + THREADS - 1) / THREADS; - - // one kernel launch that writes each element exactly once - evaluator<<>>(&data[0], expr); - cudaDeviceSynchronize(); - return *this; -} -template Field3D& - Field3D::operator= (BinaryExpr expr); -#endif - -#if 0 -Field3D::Field3D(const BinaryExpr& expr) { - Array data{expr.getSize()}; - - constexpr int THREADS = 256; - int blocks = (expr.getSize() + THREADS - 1) / THREADS; - evaluator<<>>(&data[0], expr); - cudaDeviceSynchronize(); - *this = Field3D{data, expr.getMesh(), expr.getLocation(), expr.getDirections()}; -} -#endif - -// Provide the C++ wrapper for multiplication of Field3D and Field3D -Field3D operator*(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - 
Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by multiplication with Field3D -Field3D& Field3D::operator*=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) * rhs; - } - return *this; -} - -// Provide the C++ wrapper for division of Field3D and Field3D -Field3D operator/(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] / rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by division with Field3D -Field3D& Field3D::operator/=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] /= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) / rhs; - } - return *this; -} - -#if 0 -// Provide the C++ wrapper for addition of Field3D and Field3D -template -BinaryExpr operator+(const L& lhs, const R& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - - std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{lhs, - rhs, - BinaryExpr::Op::ADD, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - (regionID.has_value() ? lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; -} -template BinaryExpr operator+ (const Field3D& lhs, - const Field3D& rhs); -#endif - -// Provide the C++ operator to update Field3D by addition with Field3D -Field3D& Field3D::operator+=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] += rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) + rhs; - } - return *this; -} - -// Provide the C++ wrapper for subtraction of Field3D and Field3D -Field3D operator-(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] - rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by subtraction with Field3D -Field3D& Field3D::operator-=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] -= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) - rhs; - } - return *this; -} - // Provide the C++ wrapper for multiplication of Field3D and Field2D Field3D operator*(const Field3D& lhs, const Field2D& rhs) { ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -328,6 +116,7 @@ Field3D& Field3D::operator/=(const Field2D& rhs) { return *this; } +#if 1 // Provide the C++ wrapper for addition of Field3D and Field2D Field3D operator+(const Field3D& lhs, const Field2D& rhs) { ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -350,6 +139,7 @@ Field3D operator+(const Field3D& lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field3D by addition with Field2D Field3D& Field3D::operator+=(const Field2D& rhs) { From 34cba4d30879dcfbe76e0dfb9bba43af965342d6 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 28 May 2025 00:19:27 -0700 Subject: [PATCH 06/58] WIP 4 - Use functor template parameter for operation --- include/bout/field3d.hxx | 26 +++++++++---------- include/bout/fieldops.hxx | 53 ++++++++++++++++++--------------------- 2 files changed, 36 insertions(+), 43 deletions(-) diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index a299f5bfdc..9696fce77d 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -39,8 +39,6 @@ class Field3D; class Mesh; -//template -//class BinaryExpr; #include "bout/fieldops.hxx" /// Class for 3D X-Y-Z scalar fields @@ -187,8 +185,8 @@ public: Field3D(Array data, Mesh* localmesh, CELL_LOC location = CELL_CENTRE, DirectionTypes directions_in = {YDirectionType::Standard, ZDirectionType::Standard}); - template - Field3D(const BinaryExpr& expr) { + template + Field3D(const BinaryExpr& expr) { Array data{expr.getSize()}; expr.evaluate(&data[0]); *this 
= Field3D{data, expr.getMesh(), expr.getLocation(), expr.getDirections()}; @@ -444,8 +442,8 @@ public: /// return void, as only part initialised void operator=(const FieldPerp& rhs); Field3D& operator=(BoutReal val); - template - Field3D& operator=(BinaryExpr expr) { + template + Field3D& operator=(BinaryExpr expr) { expr.evaluate(&data[0]); return *this; } @@ -578,13 +576,13 @@ FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); template && is_expr_v>> -BinaryExpr operator+(const L& lhs, const R& rhs) { +BinaryExpr operator+(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; return BinaryExpr{static_cast(lhs), static_cast(rhs), - BinaryExpr::Op::ADD, + bout::op::Add{}, lhs.getMesh(), lhs.getLocation(), lhs.getDirections(), @@ -595,13 +593,13 @@ BinaryExpr operator+(const L& lhs, const R& template && is_expr_v>> -BinaryExpr operator-(const L& lhs, const R& rhs) { +BinaryExpr operator-(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator- using BinaryExpr with CUDA" << "\n"; return BinaryExpr{static_cast(lhs), static_cast(rhs), - BinaryExpr::Op::SUB, + bout::op::Sub{}, lhs.getMesh(), lhs.getLocation(), lhs.getDirections(), @@ -612,13 +610,13 @@ BinaryExpr operator-(const L& lhs, const R& template && is_expr_v>> -BinaryExpr operator*(const L& lhs, const R& rhs) { +BinaryExpr operator*(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator* using BinaryExpr with CUDA" << "\n"; return BinaryExpr{static_cast(lhs), static_cast(rhs), - BinaryExpr::Op::MUL, + bout::op::Mul{}, lhs.getMesh(), lhs.getLocation(), lhs.getDirections(), @@ -629,13 +627,13 @@ BinaryExpr operator*(const L& lhs, const R& template && is_expr_v>> -BinaryExpr operator/(const L& 
lhs, const R& rhs) { +BinaryExpr operator/(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator/ using BinaryExpr with CUDA" << "\n"; return BinaryExpr{static_cast(lhs), static_cast(rhs), - BinaryExpr::Op::DIV, + bout::op::Div{}, lhs.getMesh(), lhs.getLocation(), lhs.getDirections(), diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 2ef7161f6d..b2052b12a1 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -12,17 +12,21 @@ class Field3D; #include -struct Add { - __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a + b; } -}; -struct Sub { - __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a - b; } -}; -struct Mul { - __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a * b; } +namespace bout { +namespace op { + struct Add { + __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a + b; } + }; + struct Sub { + __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a - b; } + }; + struct Mul { + __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a * b; } + }; + struct Div { + __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a / b; } + }; }; -struct Div { - __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a / b; } }; template @@ -34,7 +38,7 @@ __global__ static void evaluatorExpr(BoutReal* out, const Expr& expr) { } } -template +template struct BinaryExpr { enum class Op { ADD, SUB, MUL, DIV }; struct RegionIndices { @@ -54,7 +58,7 @@ struct BinaryExpr { L lhs; R rhs; RegionIndices indices; - Op op; + Func f; Mesh* mesh; CELL_LOC location = CELL_CENTRE; @@ -62,10 +66,10 @@ struct BinaryExpr { std::optional regionID; template - BinaryExpr(L lhs, R rhs, Op op, Mesh* mesh, CELL_LOC location, + BinaryExpr(L lhs, R rhs, Func f, Mesh* 
mesh, CELL_LOC location, DirectionTypes directions, std::optional regionID, const Region& region) - : lhs(lhs), rhs(rhs), op(op), mesh(mesh), location(location), + : lhs(lhs), rhs(rhs), f(f), mesh(mesh), location(location), directions(directions), regionID(regionID), indices(region.getIndices().size()) { // Copy the region indices into the managed array for (int i = 0; i < indices.size; ++i) { @@ -80,26 +84,17 @@ struct BinaryExpr { R rhs; int* indices; int size; - Op op; + Func f; __host__ __device__ inline int getSize() const { return size; } __device__ inline int regionIdx(int idx) const { return indices[idx]; } __device__ inline BoutReal operator()(int idx) const { - switch (op) { - case Op::ADD: - return Add{}(lhs(idx), rhs(idx)); - case Op::SUB: - return Sub{}(lhs(idx), rhs(idx)); - case Op::MUL: - return Mul{}(lhs(idx), rhs(idx)); - case Op::DIV: - return Div{}(lhs(idx), rhs(idx)); - } + f(lhs(idx), rhs(idx)); // single‐pass fusion } }; - operator View() { return View{lhs, rhs, indices.data, indices.size, op}; } - operator View() const { return View{lhs, rhs, indices.data, indices.size, op}; } + operator View() { return View{lhs, rhs, indices.data, indices.size, f}; } + operator View() const { return View{lhs, rhs, indices.data, indices.size, f}; } void evaluate(BoutReal* data) const { constexpr int THREADS = 256; @@ -127,8 +122,8 @@ struct BinaryExpr { // 1) detect our BinaryExpr template template struct is_binary_expr : std::false_type {}; -template -struct is_binary_expr> : std::true_type {}; +template +struct is_binary_expr> : std::true_type {}; // 2) detect “any subclass of Field” // assuming Field is your common base class From 7d75b9db770db6ab7a30d3b7c27151e817e82086 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 28 May 2025 00:29:57 -0700 Subject: [PATCH 07/58] Used managed array for indices --- include/bout/fieldops.hxx | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git 
a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index b2052b12a1..2ea6ac44a6 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -41,23 +41,9 @@ __global__ static void evaluatorExpr(BoutReal* out, const Expr& expr) { template struct BinaryExpr { enum class Op { ADD, SUB, MUL, DIV }; - struct RegionIndices { - int* data; - int size; - - RegionIndices(int n) : size(n) { - cudaMallocManaged(&data, n * sizeof(int)); - for (int i = 0; i < n; ++i) - data[i] = 0; - } - ~RegionIndices() { cudaFree(data); } - - __device__ inline int operator()(int idx) const { return data[idx]; } - }; - L lhs; R rhs; - RegionIndices indices; + Array indices; Func f; Mesh* mesh; @@ -72,17 +58,17 @@ struct BinaryExpr { : lhs(lhs), rhs(rhs), f(f), mesh(mesh), location(location), directions(directions), regionID(regionID), indices(region.getIndices().size()) { // Copy the region indices into the managed array - for (int i = 0; i < indices.size; ++i) { - indices.data[i] = region.getIndices()[i].ind; + for (int i = 0; i < indices.size(); ++i) { + indices[i] = region.getIndices()[i].ind; } } - __host__ inline int getSize() const { return indices.size; } + __host__ inline int getSize() const { return indices.size(); } struct View { L lhs; R rhs; - int* indices; + const int* indices; int size; Func f; @@ -93,8 +79,8 @@ struct BinaryExpr { } }; - operator View() { return View{lhs, rhs, indices.data, indices.size, f}; } - operator View() const { return View{lhs, rhs, indices.data, indices.size, f}; } + operator View() { return View{lhs, rhs, &indices[0], indices.size(), f}; } + operator View() const { return View{lhs, rhs, &indices[0], indices.size(), f}; } void evaluate(BoutReal* data) const { constexpr int THREADS = 256; From b8e7e973e0a4a245c1aa1be2933a0c830a2818e6 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 28 May 2025 02:03:25 -0700 Subject: [PATCH 08/58] Better SFINAE for specializations --- include/bout/field3d.hxx | 67 
++++++++++++++++++++++++++++++++------- include/bout/fieldops.hxx | 21 +++++------- 2 files changed, 63 insertions(+), 25 deletions(-) diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 9696fce77d..cd981ff60d 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -40,6 +40,13 @@ class Field3D; class Mesh; #include "bout/fieldops.hxx" +// Base template: nothing is an expression by default +template +struct is_expr_field3d : std::false_type {}; + +// Helper variable template +template +inline constexpr bool is_expr_field3d_v = is_expr_field3d>::value; /// Class for 3D X-Y-Z scalar fields /*! @@ -453,7 +460,7 @@ public: /// Addition operators ///@{ //Field3D& operator+=(const Field3D& rhs); - template >> + template >> Field3D& operator+=(const R& rhs) { printf("Running operator+= with CUDA\n"); data.ensureUnique(); @@ -467,7 +474,7 @@ public: /// Subtraction operators ///@{ //Field3D& operator-=(const Field3D& rhs); - template >> + template >> Field3D& operator-=(const R& rhs) { printf("Running operator-= with CUDA\n"); data.ensureUnique(); @@ -481,7 +488,7 @@ public: /// Multiplication operators ///@{ //Field3D& operator*=(const Field3D& rhs); - template >> + template >> Field3D& operator*=(const R& rhs) { printf("Running operator*= with CUDA\n"); data.ensureUnique(); @@ -494,7 +501,7 @@ public: /// Division operators ///@{ - template >> + template >> Field3D& operator/=(const R& rhs) { printf("Running operator/= with CUDA\n"); data.ensureUnique(); @@ -575,8 +582,9 @@ FieldPerp operator*(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); template && is_expr_v>> -BinaryExpr operator+(const L& lhs, const R& rhs) { + typename = std::enable_if_t && is_expr_field3d_v>> +BinaryExpr operator+(const L& lhs, + const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; @@ 
-592,8 +600,9 @@ BinaryExpr operator+(const L& } template && is_expr_v>> -BinaryExpr operator-(const L& lhs, const R& rhs) { + typename = std::enable_if_t && is_expr_field3d_v>> +BinaryExpr operator-(const L& lhs, + const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator- using BinaryExpr with CUDA" << "\n"; @@ -609,8 +618,9 @@ BinaryExpr operator-(const L& } template && is_expr_v>> -BinaryExpr operator*(const L& lhs, const R& rhs) { + typename = std::enable_if_t && is_expr_field3d_v>> +BinaryExpr operator*(const L& lhs, + const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator* using BinaryExpr with CUDA" << "\n"; @@ -626,8 +636,9 @@ BinaryExpr operator*(const L& } template && is_expr_v>> -BinaryExpr operator/(const L& lhs, const R& rhs) { + typename = std::enable_if_t && is_expr_field3d_v>> +BinaryExpr operator/(const L& lhs, + const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator/ using BinaryExpr with CUDA" << "\n"; @@ -643,6 +654,22 @@ BinaryExpr operator/(const L& } Field3D operator+(const Field3D& lhs, const Field2D& rhs); +// template && is_expr_field2d_v>> +// BinaryExpr operator+(const L& lhs, const R& rhs) { +// auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); +// +// std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; +// return BinaryExpr{static_cast(lhs), +// static_cast(rhs), +// bout::op::Add{}, +// lhs.getMesh(), +// lhs.getLocation(), +// lhs.getDirections(), +// regionID, +// (regionID.has_value() ? 
lhs.getMesh()->getRegion(regionID.value()) +// : lhs.getMesh()->getRegion("RGN_ALL"))}; +// } Field3D operator-(const Field3D& lhs, const Field2D& rhs); Field3D operator*(const Field3D& lhs, const Field2D& rhs); Field3D operator/(const Field3D& lhs, const Field2D& rhs); @@ -769,4 +796,20 @@ bool operator==(const Field3D& a, const Field3D& b); /// Output a string describing a Field3D to a stream std::ostream& operator<<(std::ostream& out, const Field3D& value); +// A raw Field3D is an expression leaf +template <> +struct is_expr_field3d : std::true_type {}; +template <> +struct is_expr_field3d : std::true_type {}; + +// Any nested BinaryExpr is an expression iff L is +template +struct is_expr_field3d> + : std::true_type {}; + +//template +//struct is_expr_field3d< typename BinaryExpr::View > +// : std::integral_constant>::value> {}; +// //: is_expr_field3d> {}; + #endif /* BOUT_FIELD3D_H */ diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 2ea6ac44a6..7c7f511cc3 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -40,7 +40,6 @@ __global__ static void evaluatorExpr(BoutReal* out, const Expr& expr) { template struct BinaryExpr { - enum class Op { ADD, SUB, MUL, DIV }; L lhs; R rhs; Array indices; @@ -95,16 +94,7 @@ struct BinaryExpr { std::optional getRegionID() const { return regionID; }; }; -//template -//struct Expr { -// using type = T; -//}; -// -//template <> -//struct Expr { -// using type = Field3D::View; -//}; - +#if 0 // 1) detect our BinaryExpr template template struct is_binary_expr : std::false_type {}; @@ -114,10 +104,15 @@ struct is_binary_expr> : std::true_type {}; // 2) detect “any subclass of Field” // assuming Field is your common base class template -constexpr bool is_field_v = std::is_base_of>::value; +constexpr bool is_field3d_v = std::is_base_of>::value; // 3) combine into “is one of our expression types” template -constexpr bool is_expr_v = is_field_v || is_binary_expr>::value; +constexpr bool 
is_expr_field3d_v = + is_field3d_v || is_binary_expr>::value; +#endif + +#if 1 +#endif #endif // BOUT_EXPRESSION_HXX \ No newline at end of file From d830f8d781c48921a9f6ce5ec5a519d162c9fa3a Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 28 May 2025 23:15:30 -0700 Subject: [PATCH 09/58] Fix major bug in the binary expr operator() and add operators - Working version --- include/bout/field2d.hxx | 10 + include/bout/field3d.hxx | 163 ++++++++++----- include/bout/fieldops.hxx | 84 ++++---- include/bout/rajalib.hxx | 2 +- include/bout/vector3d.hxx | 1 - src/field/generated_fieldops.cxx | 193 +++++++++++++++++- src/field/vecops.cxx | 1 - .../laplace/impls/naulin/naulin_laplace.cxx | 1 - src/sys/derivs.cxx | 1 - 9 files changed, 348 insertions(+), 108 deletions(-) diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index 92658f1bbf..5f0901ac67 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -276,6 +276,16 @@ public: int size() const override { return nx * ny; }; + struct View { + const BoutReal* data; + __device__ inline BoutReal operator()(int idx) const { return data[idx]; } + }; + operator View() { return View{&data[0]}; } + operator View() const { return View{&data[0]}; } + + __device__ inline BoutReal operator()(int i) { return View()(i); } + __device__ inline BoutReal operator()(int i) const { return View()(i); } + private: /// Internal data array. 
Handles allocation/freeing of memory Array data; diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index cd981ff60d..c6bc02faf9 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -44,10 +44,16 @@ class Mesh; template struct is_expr_field3d : std::false_type {}; +template +struct is_expr_field2d : std::false_type {}; + // Helper variable template template inline constexpr bool is_expr_field3d_v = is_expr_field3d>::value; +template +inline constexpr bool is_expr_field2d_v = is_expr_field2d>::valuen; + /// Class for 3D X-Y-Z scalar fields /*! This class represents a scalar field defined over the mesh. @@ -194,9 +200,12 @@ public: ZDirectionType::Standard}); template Field3D(const BinaryExpr& expr) { - Array data{expr.getSize()}; + std::cout << "RUNNING constructor from BinaryExpr\n"; + Array data{expr.size()}; expr.evaluate(&data[0]); - *this = Field3D{data, expr.getMesh(), expr.getLocation(), expr.getDirections()}; + *this = std::move(Field3D{std::move(data), expr.getMesh(), expr.getLocation(), + expr.getDirections()}); + setRegion(expr.getRegionID()); } /// Destructor ~Field3D() override; @@ -430,13 +439,12 @@ public: struct View { const BoutReal* data; - __device__ inline BoutReal operator()(int idx) const { return data[idx]; } + __host__ __device__ inline BoutReal operator()(int idx) const { return data[idx]; } + //__device__ inline const BoutReal* operator()() const { return data; } }; operator View() { return View{&data[0]}; } operator View() const { return View{&data[0]}; } - __device__ inline BoutReal operator()(int i) { return View()(i); } - __device__ inline BoutReal operator()(int i) const { return View()(i); } ///////////////////////////////////////////////////////// // Operators @@ -450,7 +458,9 @@ public: void operator=(const FieldPerp& rhs); Field3D& operator=(BoutReal val); template - Field3D& operator=(BinaryExpr expr) { + Field3D& operator=(BinaryExpr& expr) { + std::cout << "RUNNING operator= with CUDA\n"; + 
regionID = expr.getRegionID(); expr.evaluate(&data[0]); return *this; } @@ -462,9 +472,19 @@ public: //Field3D& operator+=(const Field3D& rhs); template >> Field3D& operator+=(const R& rhs) { - printf("Running operator+= with CUDA\n"); - data.ensureUnique(); - (*this) = (*this) + rhs; + printf("RUNNING operator+= with CUDA\n"); + if (data.unique()) { + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. + clearParallelSlices(); + + auto BE = (*this) + rhs; + regionID = BE.getRegionID(); + BE.evaluate(&data[0]); + } else { + (*this) = (*this) + rhs; + } + return *this; } Field3D& operator+=(const Field2D& rhs); @@ -476,9 +496,18 @@ public: //Field3D& operator-=(const Field3D& rhs); template >> Field3D& operator-=(const R& rhs) { - printf("Running operator-= with CUDA\n"); - data.ensureUnique(); - (*this) = (*this) - rhs; + if (data.unique()) { + printf("RUNNING operator-= with CUDA with BE\n"); + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. + clearParallelSlices(); + auto BE = (*this) - rhs; + BE.evaluate(&data[0]); + } else { + printf("RUNNING operator-= with CUDA with operation\n"); + (*this) = (*this) - rhs; + } + return *this; } Field3D& operator-=(const Field2D& rhs); @@ -490,9 +519,19 @@ public: //Field3D& operator*=(const Field3D& rhs); template >> Field3D& operator*=(const R& rhs) { - printf("Running operator*= with CUDA\n"); - data.ensureUnique(); - (*this) = (*this) * rhs; + printf("RUNNING operator*= with CUDA\n"); + if (data.unique()) { + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. 
+ clearParallelSlices(); + + auto BE = (*this) * rhs; + regionID = BE.getRegionID(); + BE.evaluate(&data[0]); + } else { + (*this) = (*this) * rhs; + } + return *this; } Field3D& operator*=(const Field2D& rhs); @@ -501,14 +540,24 @@ public: /// Division operators ///@{ + //Field3D& operator/=(const Field3D& rhs); template >> Field3D& operator/=(const R& rhs) { - printf("Running operator/= with CUDA\n"); - data.ensureUnique(); - (*this) = (*this) * rhs; + printf("RUNNING operator/= with CUDA\n"); + if (data.unique()) { + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. + clearParallelSlices(); + + auto BE = (*this) / rhs; + regionID = BE.getRegionID(); + BE.evaluate(&data[0]); + } else { + (*this) = (*this) / rhs; + } + return *this; } - //Field3D& operator/=(const Field3D& rhs); Field3D& operator/=(const Field2D& rhs); Field3D& operator/=(BoutReal rhs); ///@} @@ -571,10 +620,9 @@ private: // Non-member overloaded operators -template +template constexpr bool always_false = false; - // Binary operators FieldPerp operator+(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs); @@ -583,13 +631,12 @@ FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); template && is_expr_field3d_v>> -BinaryExpr operator+(const L& lhs, - const R& rhs) { +BinaryExpr operator+(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast(lhs), - static_cast(rhs), + return BinaryExpr{(lhs), + (rhs), bout::op::Add{}, lhs.getMesh(), lhs.getLocation(), @@ -601,13 +648,12 @@ BinaryExpr operator+(const L& template && is_expr_field3d_v>> -BinaryExpr operator-(const L& lhs, - const R& rhs) { +BinaryExpr operator-(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), 
rhs.getRegionID()); std::cout << "RUNNING operator- using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast(lhs), - static_cast(rhs), + return BinaryExpr{(lhs), + (rhs), bout::op::Sub{}, lhs.getMesh(), lhs.getLocation(), @@ -619,13 +665,12 @@ BinaryExpr operator-(const L& template && is_expr_field3d_v>> -BinaryExpr operator*(const L& lhs, - const R& rhs) { +BinaryExpr operator*(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator* using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast(lhs), - static_cast(rhs), + return BinaryExpr{(lhs), + (rhs), bout::op::Mul{}, lhs.getMesh(), lhs.getLocation(), @@ -637,13 +682,12 @@ BinaryExpr operator*(const L& template && is_expr_field3d_v>> -BinaryExpr operator/(const L& lhs, - const R& rhs) { +BinaryExpr operator/(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator/ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast(lhs), - static_cast(rhs), + return BinaryExpr{(lhs), + (rhs), bout::op::Div{}, lhs.getMesh(), lhs.getLocation(), @@ -654,22 +698,26 @@ BinaryExpr operator/(const L& } Field3D operator+(const Field3D& lhs, const Field2D& rhs); -// template && is_expr_field2d_v>> -// BinaryExpr operator+(const L& lhs, const R& rhs) { -// auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); +//template +//auto operator+(const L& lhs, const R& rhs) +// -> std::enable_if_t && is_expr_field2d_v, +// BinaryExpr> { +// static_assert(always_false || always_false, "Hello"); +// auto regionID = lhs.getRegionID(); // -// std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; -// return BinaryExpr{static_cast(lhs), -// static_cast(rhs), -// bout::op::Add{}, -// lhs.getMesh(), -// lhs.getLocation(), -// lhs.getDirections(), -// regionID, -// (regionID.has_value() ? 
lhs.getMesh()->getRegion(regionID.value()) -// : lhs.getMesh()->getRegion("RGN_ALL"))}; -// } +// std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; +// int mesh_nz = lhs.getMesh()->LocalNz; +// auto LambdaOp = [mesh_nz]() { +// }; +// return BinaryExpr{(lhs), +// (rhs), +// bout::op::Add{}, +// lhs.getMesh(), +// lhs.getLocation(), +// lhs.getDirections(), +// regionID, +// rhs.getRegion("RGN_ALL")}; +//} Field3D operator-(const Field3D& lhs, const Field2D& rhs); Field3D operator*(const Field3D& lhs, const Field2D& rhs); Field3D operator/(const Field3D& lhs, const Field2D& rhs); @@ -713,7 +761,7 @@ void checkData(const Field3D& f, const std::string& region = "RGN_NOBNDRY"); /// Ignored with disabled CHECK; Throw an exception if \p f is not /// allocated or if any elements are non-finite (for CHECK > 2) inline void checkData(const Field3D& UNUSED(f), - const std::string& UNUSED(region) = "RGN_NOBNDRY"){}; + const std::string& UNUSED(region) = "RGN_NOBNDRY") {}; #endif /// Fourier filtering, removes all except one mode @@ -799,13 +847,18 @@ std::ostream& operator<<(std::ostream& out, const Field3D& value); // A raw Field3D is an expression leaf template <> struct is_expr_field3d : std::true_type {}; + template <> -struct is_expr_field3d : std::true_type {}; +struct is_expr_field2d : std::true_type {}; // Any nested BinaryExpr is an expression iff L is +//template +//struct is_expr_field3d> +// : std::true_type {}; + template struct is_expr_field3d> - : std::true_type {}; + : std::integral_constant>::value> {}; //template //struct is_expr_field3d< typename BinaryExpr::View > diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 7c7f511cc3..e9c87f242f 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -15,16 +15,26 @@ class Field3D; namespace bout { namespace op { struct Add { - __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a + b; } + template + __host__ __device__ inline 
BoutReal operator()(int idx, const LView &L, const RView &R) const { + return L(idx) + R(idx); + } + __host__ __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a + b; } }; struct Sub { - __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a - b; } - }; - struct Mul { - __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a * b; } - }; - struct Div { - __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a / b; } + template + __host__ __device__ inline BoutReal operator()(int idx, const LView &L, const RView &R) const { return L(idx) - R(idx); } + __host__ __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a - b; } + }; + struct Mul { + template + __host__ __device__ inline BoutReal operator()(int idx, const LView &L, const RView &R) const { return L(idx) * R(idx); } + __host__ __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a * b; } + }; + struct Div { + template + __host__ __device__ inline BoutReal operator()(int idx, const LView &L, const RView &R) const { return L(idx) / R(idx); } + __host__ __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a / b; } }; }; }; @@ -33,15 +43,17 @@ template __global__ static void evaluatorExpr(BoutReal* out, const Expr& expr) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int stride = blockDim.x * gridDim.x; - for (int i = tid; i < expr.getSize(); i += stride) { + for (int i = tid; i < expr.size(); i += stride) { out[expr.regionIdx(i)] = expr(expr.regionIdx(i)); // single‐pass fusion } } template struct BinaryExpr { - L lhs; - R rhs; + const L &LHS; + const R &RHS; + typename L::View lhs; + typename R::View rhs; Array indices; Func f; @@ -51,30 +63,36 @@ struct BinaryExpr { std::optional regionID; template - BinaryExpr(L lhs, R rhs, Func f, Mesh* mesh, CELL_LOC location, + BinaryExpr(const L &lhs, const R &rhs, Func f, Mesh* mesh, CELL_LOC location, 
DirectionTypes directions, std::optional regionID, const Region& region) - : lhs(lhs), rhs(rhs), f(f), mesh(mesh), location(location), - directions(directions), regionID(regionID), indices(region.getIndices().size()) { + : LHS(lhs), RHS(rhs), lhs(static_cast(lhs)), rhs(static_cast(rhs)), + f(f), mesh(mesh), location(location), directions(directions), regionID(regionID), + indices(region.getIndices().size()) { // Copy the region indices into the managed array for (int i = 0; i < indices.size(); ++i) { indices[i] = region.getIndices()[i].ind; } } - __host__ inline int getSize() const { return indices.size(); } + inline int size() const { return indices.size(); } + inline BoutReal operator()(int idx) const { + return f(idx, lhs, rhs); // single‐pass fusion + } + inline int regionIdx(int idx) const { return indices[idx]; } struct View { - L lhs; - R rhs; + typename L::View lhs; + typename R::View rhs; const int* indices; - int size; + int num_indices; Func f; - __host__ __device__ inline int getSize() const { return size; } + __device__ inline int size() const { return num_indices; } __device__ inline int regionIdx(int idx) const { return indices[idx]; } __device__ inline BoutReal operator()(int idx) const { - f(lhs(idx), rhs(idx)); // single‐pass fusion + return f(idx, lhs, rhs); // single‐pass fusion + //return f(lhs(idx), rhs(idx)); // single‐pass fusion } }; @@ -83,9 +101,12 @@ struct BinaryExpr { void evaluate(BoutReal* data) const { constexpr int THREADS = 256; - int blocks = (getSize() + THREADS - 1) / THREADS; + int blocks = (size() + THREADS - 1) / THREADS; evaluatorExpr<<>>(&data[0], static_cast(*this)); cudaDeviceSynchronize(); + //for(int i=0; i getRegionID() const { return regionID; }; }; -#if 0 -// 1) detect our BinaryExpr template -template -struct is_binary_expr : std::false_type {}; -template -struct is_binary_expr> : std::true_type {}; - -// 2) detect “any subclass of Field” -// assuming Field is your common base class -template -constexpr bool 
is_field3d_v = std::is_base_of>::value; - -// 3) combine into “is one of our expression types” -template -constexpr bool is_expr_field3d_v = - is_field3d_v || is_binary_expr>::value; -#endif - -#if 1 - -#endif #endif // BOUT_EXPRESSION_HXX \ No newline at end of file diff --git a/include/bout/rajalib.hxx b/include/bout/rajalib.hxx index b3da46da50..d61a58e0d8 100644 --- a/include/bout/rajalib.hxx +++ b/include/bout/rajalib.hxx @@ -145,7 +145,7 @@ template __global__ void evaluator(BoutReal *out, Expr &expr) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int stride = blockDim.x * gridDim.x; - for (int i = tid; i < expr.getSize(); i += stride) { + for (int i = tid; i < expr.size(); i += stride) { out[expr.regionIdx(i)] = expr(expr.regionIdx(i)); // single‐pass fusion } } diff --git a/include/bout/vector3d.hxx b/include/bout/vector3d.hxx index ad68dc17ee..0c71dcffa5 100644 --- a/include/bout/vector3d.hxx +++ b/include/bout/vector3d.hxx @@ -36,7 +36,6 @@ class Vector3D; class Field2D; class Vector2D; #include "bout/field3d.hxx" -#include "bout/fieldops.hxx" /*! 
* Represents a 3D vector, with x,y,z components diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index 63cbf9b847..c35ae2e866 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -1,15 +1,198 @@ // This file is autogenerated - see gen_fieldops.py -#include "bout/rajalib.hxx" -#include "bout/fieldops.hxx" - #include #include -#include #include #include #include #include +// Provide the C++ wrapper for multiplication of Field3D and Field3D +Field3D operator*(const Field3D& lhs, const Field3D& rhs) { + ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); + + Field3D result{emptyFrom(lhs)}; + checkData(lhs); + checkData(rhs); + + result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); + + BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { + result[index] = lhs[index] * rhs[index]; + } + + checkData(result); + return result; +} + +// Provide the C++ operator to update Field3D by multiplication with Field3D +#if 0 +Field3D& Field3D::operator*=(const Field3D& rhs) { + // only if data is unique we update the field + // otherwise just call the non-inplace version + if (data.unique()) { + ASSERT1_FIELDS_COMPATIBLE(*this, rhs); + + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. 
+ clearParallelSlices(); + + checkData(*this); + checkData(rhs); + + regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); + + BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= rhs[index]; } + + checkData(*this); + + } else { + (*this) = (*this) * rhs; + } + return *this; +} +#endif + +// Provide the C++ wrapper for division of Field3D and Field3D +Field3D operator/(const Field3D& lhs, const Field3D& rhs) { + ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); + + Field3D result{emptyFrom(lhs)}; + checkData(lhs); + checkData(rhs); + + result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); + + BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { + result[index] = lhs[index] / rhs[index]; + } + + checkData(result); + return result; +} + +// Provide the C++ operator to update Field3D by division with Field3D +#if 0 +Field3D& Field3D::operator/=(const Field3D& rhs) { + // only if data is unique we update the field + // otherwise just call the non-inplace version + if (data.unique()) { + ASSERT1_FIELDS_COMPATIBLE(*this, rhs); + + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. 
+ clearParallelSlices(); + + checkData(*this); + checkData(rhs); + + regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); + + BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] /= rhs[index]; } + + checkData(*this); + + } else { + (*this) = (*this) / rhs; + } + return *this; +} +#endif + +// Provide the C++ wrapper for addition of Field3D and Field3D +Field3D operator+(const Field3D& lhs, const Field3D& rhs) { + ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); + + Field3D result{emptyFrom(lhs)}; + checkData(lhs); + checkData(rhs); + + result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); + + BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { + result[index] = lhs[index] + rhs[index]; + } + + checkData(result); + return result; +} + +#if 0 +// Provide the C++ operator to update Field3D by addition with Field3D +Field3D& Field3D::operator+=(const Field3D& rhs) { + // only if data is unique we update the field + // otherwise just call the non-inplace version + if (data.unique()) { + ASSERT1_FIELDS_COMPATIBLE(*this, rhs); + + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. 
+ clearParallelSlices(); + + checkData(*this); + checkData(rhs); + + regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); + + BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] += rhs[index]; } + + checkData(*this); + + } else { + (*this) = (*this) + rhs; + } + return *this; +} +#endif + +// Provide the C++ wrapper for subtraction of Field3D and Field3D +Field3D operator-(const Field3D& lhs, const Field3D& rhs) { + ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); + + Field3D result{emptyFrom(lhs)}; + checkData(lhs); + checkData(rhs); + + result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); + + BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { + result[index] = lhs[index] - rhs[index]; + } + + checkData(result); + return result; +} + +// Provide the C++ operator to update Field3D by subtraction with Field3D +#if 0 +Field3D& Field3D::operator-=(const Field3D& rhs) { + // only if data is unique we update the field + // otherwise just call the non-inplace version + if (data.unique()) { + ASSERT1_FIELDS_COMPATIBLE(*this, rhs); + + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. 
+ clearParallelSlices(); + + checkData(*this); + checkData(rhs); + + regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); + + BOUT_FOR(index, this->getRegion("RGN_ALL")) { + (*this)[index] -= rhs[index]; + printf("[golden] val[%d] %lf\n", index, (*this)[index]); + } + + checkData(*this); + + } else { + (*this) = (*this) - rhs; + } + return *this; +} +#endif + // Provide the C++ wrapper for multiplication of Field3D and Field2D Field3D operator*(const Field3D& lhs, const Field2D& rhs) { ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -116,7 +299,6 @@ Field3D& Field3D::operator/=(const Field2D& rhs) { return *this; } -#if 1 // Provide the C++ wrapper for addition of Field3D and Field2D Field3D operator+(const Field3D& lhs, const Field2D& rhs) { ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -139,7 +321,6 @@ Field3D operator+(const Field3D& lhs, const Field2D& rhs) { checkData(result); return result; } -#endif // Provide the C++ operator to update Field3D by addition with Field2D Field3D& Field3D::operator+=(const Field2D& rhs) { diff --git a/src/field/vecops.cxx b/src/field/vecops.cxx index 9b1105e7aa..5f34e2af02 100644 --- a/src/field/vecops.cxx +++ b/src/field/vecops.cxx @@ -28,7 +28,6 @@ #include #include -#include #include #include #include diff --git a/src/invert/laplace/impls/naulin/naulin_laplace.cxx b/src/invert/laplace/impls/naulin/naulin_laplace.cxx index 74ec68dae9..e6f68d850d 100644 --- a/src/invert/laplace/impls/naulin/naulin_laplace.cxx +++ b/src/invert/laplace/impls/naulin/naulin_laplace.cxx @@ -142,7 +142,6 @@ #include #include #include -#include #include #include #include diff --git a/src/sys/derivs.cxx b/src/sys/derivs.cxx index 55e2c77a29..ee9bcbcc2c 100644 --- a/src/sys/derivs.cxx +++ b/src/sys/derivs.cxx @@ -49,7 +49,6 @@ #include -#include #include #include From c5f9fd668138b61cbd3ce6542177d538ac7becec Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 29 May 2025 22:40:53 -0700 Subject: [PATCH 10/58] WIP - More operators --- 
include/bout/array.hxx | 2 +- include/bout/field2d.hxx | 33 +++- include/bout/field3d.hxx | 254 ++++++++++++++++++---------- include/bout/fieldops.hxx | 157 +++++++++++++---- src/field/generated_fieldops.cxx | 281 +++++++++++-------------------- 5 files changed, 418 insertions(+), 309 deletions(-) diff --git a/include/bout/array.hxx b/include/bout/array.hxx index b83c29c51d..2c42f15aad 100644 --- a/include/bout/array.hxx +++ b/include/bout/array.hxx @@ -66,7 +66,7 @@ struct ArrayData { #if BOUT_HAS_UMPIRE auto& rm = umpire::ResourceManager::getInstance(); #if BOUT_HAS_CUDA - auto allocator = rm.getAllocator(umpire::resource::Unified); + auto allocator = rm.getAllocator(umpire::resource::Pinned); #else auto allocator = rm.getAllocator("HOST"); #endif diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index 5f0901ac67..db2ce194f8 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -277,11 +277,22 @@ public: int size() const override { return nx * ny; }; struct View { - const BoutReal* data; - __device__ inline BoutReal operator()(int idx) const { return data[idx]; } + BoutReal* data; + int mul = 1; + int div = 1; + __device__ inline BoutReal operator()(int idx) const { return data[(idx*mul/div)]; } + __device__ inline BoutReal& operator[](int idx) const { + return data[(idx * mul)/div]; + } + + View& setScale(int mul, int div) { + this->mul = mul; + this->div = div; + return *this; + } }; operator View() { return View{&data[0]}; } - operator View() const { return View{&data[0]}; } + operator View() const { return View{const_cast(&data[0])}; } __device__ inline BoutReal operator()(int i) { return View()(i); } __device__ inline BoutReal operator()(int i) const { return View()(i); } @@ -302,6 +313,22 @@ private: Field2D operator+(const Field2D& lhs, const Field2D& rhs); Field2D operator-(const Field2D& lhs, const Field2D& rhs); Field2D operator*(const Field2D& lhs, const Field2D& rhs); +#if 0 +template && is_expr_field2d_v>> +BinaryExpr 
operator*(const L& lhs, const R& rhs) { + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + bout::op::Mul{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + lhs.getRegionID(), + (regionID.has_value() + ? lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; +} +#endif Field2D operator/(const Field2D& lhs, const Field2D& rhs); Field3D operator+(const Field2D& lhs, const Field3D& rhs); diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index c6bc02faf9..33e42cccbb 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -52,7 +52,7 @@ template inline constexpr bool is_expr_field3d_v = is_expr_field3d>::value; template -inline constexpr bool is_expr_field2d_v = is_expr_field2d>::valuen; +inline constexpr bool is_expr_field2d_v = is_expr_field2d>::value; /// Class for 3D X-Y-Z scalar fields /*! @@ -200,7 +200,7 @@ public: ZDirectionType::Standard}); template Field3D(const BinaryExpr& expr) { - std::cout << "RUNNING constructor from BinaryExpr\n"; + //std::cout << "RUNNING constructor from BinaryExpr\n"; Array data{expr.size()}; expr.evaluate(&data[0]); *this = std::move(Field3D{std::move(data), expr.getMesh(), expr.getLocation(), @@ -438,13 +438,30 @@ public: } struct View { - const BoutReal* data; - __host__ __device__ inline BoutReal operator()(int idx) const { return data[idx]; } - //__device__ inline const BoutReal* operator()() const { return data; } + BoutReal* data; + int mul = 1; + int div = 1; + int offset = 0; + __host__ __device__ inline BoutReal operator()(int idx) const { + return data[(idx * mul) / div + offset]; + } + __device__ inline BoutReal& operator[](int idx) const { + return data[(idx * mul) / div + offset]; + } + + View& setScale(int mul, int div) { + this->mul = mul; + this->div = div; + return *this; + } + View& setOffset(int o) { + offset = o; + return *this; + } }; operator View() { return View{&data[0]}; } - operator View() const { return View{&data[0]}; } - 
+ operator View() const { return View{const_cast(&data[0])}; } + //operator View() const { return View{&data[0]}; } ///////////////////////////////////////////////////////// // Operators @@ -461,7 +478,8 @@ public: Field3D& operator=(BinaryExpr& expr) { std::cout << "RUNNING operator= with CUDA\n"; regionID = expr.getRegionID(); - expr.evaluate(&data[0]); + //expr.evaluate(&data[0]); + expr.evaluateWithResult(static_cast(*this)); return *this; } @@ -472,15 +490,17 @@ public: //Field3D& operator+=(const Field3D& rhs); template >> Field3D& operator+=(const R& rhs) { - printf("RUNNING operator+= with CUDA\n"); + //printf("RUNNING operator+= with CUDA\n"); if (data.unique()) { + printf("RUNNING operator+= with CUDA with evaluateWithResult\n"); // Delete existing parallel slices. We don't copy parallel slices, so any // that currently exist will be incorrect. clearParallelSlices(); auto BE = (*this) + rhs; regionID = BE.getRegionID(); - BE.evaluate(&data[0]); + //BE.evaluate(&data[0]); + BE.evaluateWithResult(static_cast(*this)); } else { (*this) = (*this) + rhs; } @@ -497,14 +517,14 @@ public: template >> Field3D& operator-=(const R& rhs) { if (data.unique()) { - printf("RUNNING operator-= with CUDA with BE\n"); + //printf("RUNNING operator-= with CUDA with BE\n"); // Delete existing parallel slices. We don't copy parallel slices, so any // that currently exist will be incorrect. clearParallelSlices(); auto BE = (*this) - rhs; BE.evaluate(&data[0]); } else { - printf("RUNNING operator-= with CUDA with operation\n"); + //printf("RUNNING operator-= with CUDA with operation\n"); (*this) = (*this) - rhs; } @@ -519,7 +539,7 @@ public: //Field3D& operator*=(const Field3D& rhs); template >> Field3D& operator*=(const R& rhs) { - printf("RUNNING operator*= with CUDA\n"); + //printf("RUNNING operator*= with CUDA\n"); if (data.unique()) { // Delete existing parallel slices. We don't copy parallel slices, so any // that currently exist will be incorrect. 
@@ -541,9 +561,9 @@ public: /// Division operators ///@{ //Field3D& operator/=(const Field3D& rhs); - template >> - Field3D& operator/=(const R& rhs) { - printf("RUNNING operator/= with CUDA\n"); + template + std::enable_if_t,Field3D&> operator/=(const R& rhs) { + //printf("RUNNING operator/= with CUDA\n"); if (data.unique()) { // Delete existing parallel slices. We don't copy parallel slices, so any // that currently exist will be incorrect. @@ -558,7 +578,23 @@ public: return *this; } - Field3D& operator/=(const Field2D& rhs); + //Field3D& operator/=(const Field2D& rhs); + template +std::enable_if_t, Field3D&> operator/=(const R& rhs) { + //printf("RUNNING operator/= with CUDA\n"); + if (data.unique()) { + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. + clearParallelSlices(); + + auto BE = (*this) / rhs; + BE.evaluate(&data[0]); + } else { + (*this) = (*this) / rhs; + } + + return *this; + } Field3D& operator/=(BoutReal rhs); ///@} @@ -634,16 +670,17 @@ template operator+(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{(lhs), - (rhs), - bout::op::Add{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - (regionID.has_value() ? lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; + //std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + bout::op::Add{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + (regionID.has_value() + ? 
lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; } template operator-(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - std::cout << "RUNNING operator- using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{(lhs), - (rhs), - bout::op::Sub{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - (regionID.has_value() ? lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; + //std::cout << "RUNNING operator- using BinaryExpr with CUDA" << "\n"; + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + bout::op::Sub{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + (regionID.has_value() + ? lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; } -template && is_expr_field3d_v>> -BinaryExpr operator*(const L& lhs, const R& rhs) { +template +std::enable_if_t && is_expr_field3d_v, + BinaryExpr> +operator*(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - std::cout << "RUNNING operator* using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{(lhs), - (rhs), - bout::op::Mul{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - (regionID.has_value() ? lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; + //std::cout << "RUNNING operator* using BinaryExpr with CUDA" << "\n"; + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + bout::op::Mul{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + (regionID.has_value() + ? 
lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; } template operator/(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - std::cout << "RUNNING operator/ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{(lhs), - (rhs), - bout::op::Div{}, + //std::cout << "RUNNING operator/ using BinaryExpr with CUDA" << "\n"; + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + bout::op::Div{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + (regionID.has_value() + ? lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; +} + +Field3D operator+(const Field3D& lhs, const Field2D& rhs); +#if 0 +template && is_expr_field2d_v, + BinaryExpr>> +BinaryExpr operator+(const L& lhs, const R& rhs) { + //static_assert(always_false || always_false, "Hello"); + auto regionID = lhs.getRegionID(); + + std::cout << "RUNNING Field3D + Field2D using BinaryExpr with CUDA" << "\n"; + int mesh_nz = lhs.getMesh()->LocalNz; + + return BinaryExpr{static_cast(lhs), + static_cast(rhs).setScale(1, mesh_nz), + bout::op::Add{}, lhs.getMesh(), lhs.getLocation(), lhs.getDirections(), regionID, - (regionID.has_value() ? 
lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; + rhs.getRegion("RGN_ALL")}; } - -Field3D operator+(const Field3D& lhs, const Field2D& rhs); -//template -//auto operator+(const L& lhs, const R& rhs) -// -> std::enable_if_t && is_expr_field2d_v, -// BinaryExpr> { -// static_assert(always_false || always_false, "Hello"); -// auto regionID = lhs.getRegionID(); -// -// std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; -// int mesh_nz = lhs.getMesh()->LocalNz; -// auto LambdaOp = [mesh_nz]() { -// }; -// return BinaryExpr{(lhs), -// (rhs), -// bout::op::Add{}, -// lhs.getMesh(), -// lhs.getLocation(), -// lhs.getDirections(), -// regionID, -// rhs.getRegion("RGN_ALL")}; -//} +#endif Field3D operator-(const Field3D& lhs, const Field2D& rhs); -Field3D operator*(const Field3D& lhs, const Field2D& rhs); -Field3D operator/(const Field3D& lhs, const Field2D& rhs); +//Field3D operator*(const Field3D& lhs, const Field2D& rhs); +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr> +operator*(const L& lhs, const R& rhs) { + //static_assert(always_false || always_false, "Hello"); + auto regionID = lhs.getRegionID(); + + //std::cout << "RUNNING Field3D * Field2D using BinaryExpr with CUDA" << "\n"; + int mesh_nz = lhs.getMesh()->LocalNz; + + return BinaryExpr{ + static_cast(lhs), + static_cast(rhs).setScale(1, mesh_nz), + bout::op::Mul{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + lhs.getMesh()->getRegion("RGN_ALL")}; +} +//Field3D operator/(const Field3D& lhs, const Field2D& rhs); +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr> +operator/(const L& lhs, const R& rhs) { + //static_assert(always_false || always_false, "Hello"); + auto regionID = lhs.getRegionID(); + + //std::cout << "RUNNING Field3D * Field2D using BinaryExpr with CUDA" << "\n"; + int mesh_nz = lhs.getMesh()->LocalNz; + + return BinaryExpr{ + static_cast(lhs), + static_cast(rhs).setScale(1, mesh_nz), + 
bout::op::Div{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + lhs.getMesh()->getRegion("RGN_ALL")}; +} Field3D operator+(const Field3D& lhs, BoutReal rhs); Field3D operator-(const Field3D& lhs, BoutReal rhs); @@ -851,15 +933,17 @@ struct is_expr_field3d : std::true_type {}; template <> struct is_expr_field2d : std::true_type {}; -// Any nested BinaryExpr is an expression iff L is -//template -//struct is_expr_field3d> -// : std::true_type {}; - template struct is_expr_field3d> : std::integral_constant>::value> {}; +template +struct is_expr_field2d> + : std::integral_constant>::value> {}; + +//template +//struct is_expr_field3d::View> : is_expr_field3d {}; + //template //struct is_expr_field3d< typename BinaryExpr::View > // : std::integral_constant>::value> {}; diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index e9c87f242f..d83d1eedd1 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -1,4 +1,6 @@ #pragma once +#include "bout/array.hxx" +#include #ifndef BOUT_FIELDOPS_HXX #define BOUT_FIELDOPS_HXX @@ -14,44 +16,96 @@ class Field3D; namespace bout { namespace op { - struct Add { - template - __host__ __device__ inline BoutReal operator()(int idx, const LView &L, const RView &R) const { - return L(idx) + R(idx); +struct Assign { + int scale = 1; + int offset = 0; + template + __device__ void operator()(int idx, BoutReal* out, const Expr& expr) const { + out[(idx * scale) + offset] = expr.lhs(idx) + expr.rhs(idx); + } +}; + +struct Add { + template + __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { + return L(idx) + R(idx); + } + __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { + return a + b; + } +}; + struct Sub { + template + __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { + return L(idx) - R(idx); + } + __device__ __forceinline__ BoutReal operator()(BoutReal a, 
BoutReal b) const { + return a - b; } - __host__ __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a + b; } }; - struct Sub { - template - __host__ __device__ inline BoutReal operator()(int idx, const LView &L, const RView &R) const { return L(idx) - R(idx); } - __host__ __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a - b; } - }; - struct Mul { - template - __host__ __device__ inline BoutReal operator()(int idx, const LView &L, const RView &R) const { return L(idx) * R(idx); } - __host__ __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a * b; } - }; - struct Div { - template - __host__ __device__ inline BoutReal operator()(int idx, const LView &L, const RView &R) const { return L(idx) / R(idx); } - __host__ __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a / b; } + struct Mul { + template + __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { + return L(idx) * R(idx); + } + __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { + return a * b; + } + }; + struct Div { + template + __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { + return L(idx) / R(idx); + } + __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { + return a / b; + } }; }; }; template -__global__ static void evaluatorExpr(BoutReal* out, const Expr& expr) { +__global__ __launch_bounds__(256) static void evaluatorExpr(BoutReal* out, + const Expr expr) { int tid = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = tid; i < expr.size(); i += stride) { - out[expr.regionIdx(i)] = expr(expr.regionIdx(i)); // single‐pass fusion + if (tid >= expr.size()) { + return; // Out of bounds } + int idx = expr.regionIdx(tid); + out[idx] = expr(idx); // single‐pass fusion + //int stride = blockDim.x * gridDim.x; + //for (int 
i = tid, e = expr.size(); i < e; i += stride) { + // int idx = expr.regionIdx(i); + // out[idx] = expr(idx); // single‐pass fusion + //} } +template +__global__ __launch_bounds__(256) static void evaluatorExprWithResult(Result res, + const Expr expr) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= expr.size()) { + return; // Out of bounds + } + int idx = expr.regionIdx(tid); + res[idx] = expr(idx); // single‐pass fusion + //res(idx, expr(idx)); // single‐pass fusion + //res(idx) = expr(idx); // single‐pass fusion + //int stride = blockDim.x * gridDim.x; + //for (int i = tid, e = expr.size(); i < e; i += stride) { + // int idx = expr.regionIdx(i); + // out[idx] = expr(idx); // single‐pass fusion + //} +} + +inline std::unordered_map> regionIndicesCache; + template struct BinaryExpr { - const L &LHS; - const R &RHS; typename L::View lhs; typename R::View rhs; Array indices; @@ -63,16 +117,31 @@ struct BinaryExpr { std::optional regionID; template - BinaryExpr(const L &lhs, const R &rhs, Func f, Mesh* mesh, CELL_LOC location, - DirectionTypes directions, std::optional regionID, + BinaryExpr(const typename L::View& lhs, const typename R::View& rhs, Func f, Mesh* mesh, + CELL_LOC location, DirectionTypes directions, std::optional regionID, const Region& region) - : LHS(lhs), RHS(rhs), lhs(static_cast(lhs)), rhs(static_cast(rhs)), - f(f), mesh(mesh), location(location), directions(directions), regionID(regionID), - indices(region.getIndices().size()) { + //: lhs(static_cast(lhs)), rhs(static_cast(rhs)), + : lhs(lhs), rhs(rhs), f(f), mesh(mesh), location(location), directions(directions), + regionID(regionID), indices(region.getIndices().size()) { // Copy the region indices into the managed array for (int i = 0; i < indices.size(); ++i) { indices[i] = region.getIndices()[i].ind; } + //if (regionIndicesCache.find(static_cast(const_cast*>(®ion))) + // != regionIndicesCache.end()) { + // // If we have already computed the indices for this region, use them + 
// indices = + // regionIndicesCache[static_cast(const_cast*>(®ion))]; + //} else { + // // Otherwise, compute the indices and store them in the cache + // indices = Array(region.getIndices().size()); + // // Copy the region indices into the managed array + // for (int i = 0; i < indices.size(); ++i) { + // indices[i] = region.getIndices()[i].ind; + // } + // regionIndicesCache[static_cast(const_cast*>(®ion))] = + // indices; + //} } inline int size() const { return indices.size(); } @@ -87,10 +156,21 @@ struct BinaryExpr { const int* indices; int num_indices; Func f; + int scale = 1; + int offset = 0; - __device__ inline int size() const { return num_indices; } - __device__ inline int regionIdx(int idx) const { return indices[idx]; } - __device__ inline BoutReal operator()(int idx) const { + View& setScale(int s) { + scale = s; + return *this; + } + View& setOffset(int o) { + offset = o; + return *this; + } + + __device__ __forceinline__ int size() const { return num_indices; } + __device__ __forceinline__ int regionIdx(int idx) const { return indices[idx]; } + __device__ __forceinline__ BoutReal operator()(int idx) const { return f(idx, lhs, rhs); // single‐pass fusion //return f(lhs(idx), rhs(idx)); // single‐pass fusion } @@ -109,6 +189,17 @@ struct BinaryExpr { //} } + template + void evaluateWithResult(const Result& res) const { + constexpr int THREADS = 256; + int blocks = (size() + THREADS - 1) / THREADS; + evaluatorExprWithResult<<>>(res, static_cast(*this)); + cudaDeviceSynchronize(); + //for(int i=0; i #include -// Provide the C++ wrapper for multiplication of Field3D and Field3D -Field3D operator*(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * rhs[index]; - } - - 
checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by multiplication with Field3D -#if 0 -Field3D& Field3D::operator*=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) * rhs; - } - return *this; -} -#endif - -// Provide the C++ wrapper for division of Field3D and Field3D -Field3D operator/(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] / rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by division with Field3D -#if 0 -Field3D& Field3D::operator/=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] /= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) / rhs; - } - return *this; -} -#endif - -// Provide the C++ wrapper for addition of Field3D and Field3D -Field3D operator+(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] + rhs[index]; - } - - checkData(result); - return result; -} - -#if 0 -// Provide the C++ operator to update Field3D by addition with Field3D -Field3D& Field3D::operator+=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] += rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) + rhs; - } - return *this; -} -#endif - -// Provide the C++ wrapper for subtraction of Field3D and Field3D -Field3D operator-(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] - rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by subtraction with Field3D -#if 0 -Field3D& Field3D::operator-=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { - (*this)[index] -= rhs[index]; - printf("[golden] val[%d] %lf\n", index, (*this)[index]); - } - - checkData(*this); - - } else { - (*this) = (*this) - rhs; - } - return *this; -} -#endif - // Provide the C++ wrapper for multiplication of Field3D and Field2D +#if 0 Field3D operator*(const Field3D& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator "<< __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(lhs)}; @@ -215,9 +30,11 @@ Field3D operator*(const Field3D& lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field3D by multiplication with Field2D Field3D& Field3D::operator*=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -245,8 +62,10 @@ Field3D& Field3D::operator*=(const Field2D& rhs) { return *this; } +#if 1 // Provide the C++ wrapper for division of Field3D and Field2D Field3D operator/(const Field3D& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator "<< __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(lhs)}; @@ -268,9 +87,12 @@ Field3D operator/(const Field3D& lhs, const Field2D& rhs) { checkData(result); return result; } +#endif +#if 0 // Provide the C++ operator to update Field3D by division with Field2D Field3D& Field3D::operator/=(const Field2D& rhs) { + std::cout << "RUNNING operator "<< __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -298,9 +120,11 @@ Field3D& 
Field3D::operator/=(const Field2D& rhs) { } return *this; } +#endif // Provide the C++ wrapper for addition of Field3D and Field2D Field3D operator+(const Field3D& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(lhs)}; @@ -324,6 +148,7 @@ Field3D operator+(const Field3D& lhs, const Field2D& rhs) { // Provide the C++ operator to update Field3D by addition with Field2D Field3D& Field3D::operator+=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -353,6 +178,7 @@ Field3D& Field3D::operator+=(const Field2D& rhs) { // Provide the C++ wrapper for subtraction of Field3D and Field2D Field3D operator-(const Field3D& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(lhs)}; @@ -376,6 +202,7 @@ Field3D operator-(const Field3D& lhs, const Field2D& rhs) { // Provide the C++ operator to update Field3D by subtraction with Field2D Field3D& Field3D::operator-=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -405,6 +232,7 @@ Field3D& Field3D::operator-=(const Field2D& rhs) { // Provide the C++ wrapper for multiplication of Field3D and FieldPerp FieldPerp operator*(const Field3D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -425,6 +253,7 @@ FieldPerp operator*(const Field3D& lhs, const FieldPerp& rhs) { // Provide the C++ 
wrapper for division of Field3D and FieldPerp FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -445,6 +274,7 @@ FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for addition of Field3D and FieldPerp FieldPerp operator+(const Field3D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -465,6 +295,7 @@ FieldPerp operator+(const Field3D& lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for subtraction of Field3D and FieldPerp FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -485,6 +316,7 @@ FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for multiplication of Field3D and BoutReal Field3D operator*(const Field3D& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field3D result{emptyFrom(lhs)}; checkData(lhs); @@ -502,6 +334,7 @@ Field3D operator*(const Field3D& lhs, const BoutReal rhs) { // Provide the C++ operator to update Field3D by multiplication with BoutReal Field3D& Field3D::operator*=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -525,6 +358,7 @@ Field3D& Field3D::operator*=(const BoutReal rhs) { // Provide the C++ wrapper for division of Field3D and BoutReal Field3D operator/(const Field3D& lhs, const BoutReal rhs) { + std::cout 
<< "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field3D result{emptyFrom(lhs)}; checkData(lhs); @@ -543,6 +377,7 @@ Field3D operator/(const Field3D& lhs, const BoutReal rhs) { // Provide the C++ operator to update Field3D by division with BoutReal Field3D& Field3D::operator/=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -567,6 +402,7 @@ Field3D& Field3D::operator/=(const BoutReal rhs) { // Provide the C++ wrapper for addition of Field3D and BoutReal Field3D operator+(const Field3D& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field3D result{emptyFrom(lhs)}; checkData(lhs); @@ -584,6 +420,7 @@ Field3D operator+(const Field3D& lhs, const BoutReal rhs) { // Provide the C++ operator to update Field3D by addition with BoutReal Field3D& Field3D::operator+=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -607,6 +444,7 @@ Field3D& Field3D::operator+=(const BoutReal rhs) { // Provide the C++ wrapper for subtraction of Field3D and BoutReal Field3D operator-(const Field3D& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field3D result{emptyFrom(lhs)}; checkData(lhs); @@ -624,6 +462,7 @@ Field3D operator-(const Field3D& lhs, const BoutReal rhs) { // Provide the C++ operator to update Field3D by subtraction with BoutReal Field3D& Field3D::operator-=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace 
version if (data.unique()) { @@ -647,6 +486,7 @@ Field3D& Field3D::operator-=(const BoutReal rhs) { // Provide the C++ wrapper for multiplication of Field2D and Field3D Field3D operator*(const Field2D& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(rhs)}; @@ -670,6 +510,7 @@ Field3D operator*(const Field2D& lhs, const Field3D& rhs) { // Provide the C++ wrapper for division of Field2D and Field3D Field3D operator/(const Field2D& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(rhs)}; @@ -693,6 +534,7 @@ Field3D operator/(const Field2D& lhs, const Field3D& rhs) { // Provide the C++ wrapper for addition of Field2D and Field3D Field3D operator+(const Field2D& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(rhs)}; @@ -716,6 +558,7 @@ Field3D operator+(const Field2D& lhs, const Field3D& rhs) { // Provide the C++ wrapper for subtraction of Field2D and Field3D Field3D operator-(const Field2D& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(rhs)}; @@ -739,6 +582,7 @@ Field3D operator-(const Field2D& lhs, const Field3D& rhs) { // Provide the C++ wrapper for multiplication of Field2D and Field2D Field2D operator*(const Field2D& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field2D result{emptyFrom(lhs)}; @@ -755,6 +599,7 @@ Field2D operator*(const Field2D& lhs, const Field2D& rhs) { // Provide the C++ operator to update Field2D by multiplication with 
Field2D Field2D& Field2D::operator*=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -775,6 +620,7 @@ Field2D& Field2D::operator*=(const Field2D& rhs) { // Provide the C++ wrapper for division of Field2D and Field2D Field2D operator/(const Field2D& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field2D result{emptyFrom(lhs)}; @@ -791,6 +637,7 @@ Field2D operator/(const Field2D& lhs, const Field2D& rhs) { // Provide the C++ operator to update Field2D by division with Field2D Field2D& Field2D::operator/=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -811,6 +658,7 @@ Field2D& Field2D::operator/=(const Field2D& rhs) { // Provide the C++ wrapper for addition of Field2D and Field2D Field2D operator+(const Field2D& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field2D result{emptyFrom(lhs)}; @@ -827,6 +675,7 @@ Field2D operator+(const Field2D& lhs, const Field2D& rhs) { // Provide the C++ operator to update Field2D by addition with Field2D Field2D& Field2D::operator+=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -847,6 +696,7 @@ Field2D& Field2D::operator+=(const Field2D& rhs) { // Provide the C++ wrapper for subtraction of Field2D and Field2D Field2D operator-(const Field2D& lhs, const Field2D& rhs) { + std::cout << 
"RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field2D result{emptyFrom(lhs)}; @@ -863,6 +713,7 @@ Field2D operator-(const Field2D& lhs, const Field2D& rhs) { // Provide the C++ operator to update Field2D by subtraction with Field2D Field2D& Field2D::operator-=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -883,6 +734,7 @@ Field2D& Field2D::operator-=(const Field2D& rhs) { // Provide the C++ wrapper for multiplication of Field2D and FieldPerp FieldPerp operator*(const Field2D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -903,6 +755,7 @@ FieldPerp operator*(const Field2D& lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for division of Field2D and FieldPerp FieldPerp operator/(const Field2D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -923,6 +776,7 @@ FieldPerp operator/(const Field2D& lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for addition of Field2D and FieldPerp FieldPerp operator+(const Field2D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -943,6 +797,7 @@ FieldPerp operator+(const Field2D& lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for subtraction of Field2D and FieldPerp FieldPerp operator-(const Field2D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; 
ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -963,6 +818,7 @@ FieldPerp operator-(const Field2D& lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for multiplication of Field2D and BoutReal Field2D operator*(const Field2D& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field2D result{emptyFrom(lhs)}; checkData(lhs); @@ -978,6 +834,7 @@ Field2D operator*(const Field2D& lhs, const BoutReal rhs) { // Provide the C++ operator to update Field2D by multiplication with BoutReal Field2D& Field2D::operator*=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -997,6 +854,7 @@ Field2D& Field2D::operator*=(const BoutReal rhs) { // Provide the C++ wrapper for division of Field2D and BoutReal Field2D operator/(const Field2D& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field2D result{emptyFrom(lhs)}; checkData(lhs); @@ -1013,6 +871,7 @@ Field2D operator/(const Field2D& lhs, const BoutReal rhs) { // Provide the C++ operator to update Field2D by division with BoutReal Field2D& Field2D::operator/=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1033,6 +892,7 @@ Field2D& Field2D::operator/=(const BoutReal rhs) { // Provide the C++ wrapper for addition of Field2D and BoutReal Field2D operator+(const Field2D& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field2D result{emptyFrom(lhs)}; checkData(lhs); @@ -1048,6 +908,7 @@ Field2D operator+(const Field2D& lhs, const BoutReal rhs) { // 
Provide the C++ operator to update Field2D by addition with BoutReal Field2D& Field2D::operator+=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1067,6 +928,7 @@ Field2D& Field2D::operator+=(const BoutReal rhs) { // Provide the C++ wrapper for subtraction of Field2D and BoutReal Field2D operator-(const Field2D& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field2D result{emptyFrom(lhs)}; checkData(lhs); @@ -1082,6 +944,7 @@ Field2D operator-(const Field2D& lhs, const BoutReal rhs) { // Provide the C++ operator to update Field2D by subtraction with BoutReal Field2D& Field2D::operator-=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1101,6 +964,7 @@ Field2D& Field2D::operator-=(const BoutReal rhs) { // Provide the C++ wrapper for multiplication of FieldPerp and Field3D FieldPerp operator*(const FieldPerp& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1121,6 +985,7 @@ FieldPerp operator*(const FieldPerp& lhs, const Field3D& rhs) { // Provide the C++ operator to update FieldPerp by multiplication with Field3D FieldPerp& FieldPerp::operator*=(const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1147,6 +1012,7 @@ FieldPerp& FieldPerp::operator*=(const Field3D& rhs) { // Provide the C++ wrapper for division of FieldPerp 
and Field3D FieldPerp operator/(const FieldPerp& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1167,6 +1033,7 @@ FieldPerp operator/(const FieldPerp& lhs, const Field3D& rhs) { // Provide the C++ operator to update FieldPerp by division with Field3D FieldPerp& FieldPerp::operator/=(const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1193,6 +1060,7 @@ FieldPerp& FieldPerp::operator/=(const Field3D& rhs) { // Provide the C++ wrapper for addition of FieldPerp and Field3D FieldPerp operator+(const FieldPerp& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1213,6 +1081,7 @@ FieldPerp operator+(const FieldPerp& lhs, const Field3D& rhs) { // Provide the C++ operator to update FieldPerp by addition with Field3D FieldPerp& FieldPerp::operator+=(const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1239,6 +1108,7 @@ FieldPerp& FieldPerp::operator+=(const Field3D& rhs) { // Provide the C++ wrapper for subtraction of FieldPerp and Field3D FieldPerp operator-(const FieldPerp& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1259,6 +1129,7 @@ FieldPerp operator-(const FieldPerp& lhs, const Field3D& rhs) { // Provide the C++ operator to update FieldPerp by subtraction with Field3D FieldPerp& 
FieldPerp::operator-=(const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1285,6 +1156,7 @@ FieldPerp& FieldPerp::operator-=(const Field3D& rhs) { // Provide the C++ wrapper for multiplication of FieldPerp and Field2D FieldPerp operator*(const FieldPerp& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1305,6 +1177,7 @@ FieldPerp operator*(const FieldPerp& lhs, const Field2D& rhs) { // Provide the C++ operator to update FieldPerp by multiplication with Field2D FieldPerp& FieldPerp::operator*=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1331,6 +1204,7 @@ FieldPerp& FieldPerp::operator*=(const Field2D& rhs) { // Provide the C++ wrapper for division of FieldPerp and Field2D FieldPerp operator/(const FieldPerp& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1351,6 +1225,7 @@ FieldPerp operator/(const FieldPerp& lhs, const Field2D& rhs) { // Provide the C++ operator to update FieldPerp by division with Field2D FieldPerp& FieldPerp::operator/=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1377,6 +1252,7 @@ FieldPerp& FieldPerp::operator/=(const Field2D& rhs) { // Provide the C++ wrapper for addition of FieldPerp and Field2D FieldPerp 
operator+(const FieldPerp& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1397,6 +1273,7 @@ FieldPerp operator+(const FieldPerp& lhs, const Field2D& rhs) { // Provide the C++ operator to update FieldPerp by addition with Field2D FieldPerp& FieldPerp::operator+=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1423,6 +1300,7 @@ FieldPerp& FieldPerp::operator+=(const Field2D& rhs) { // Provide the C++ wrapper for subtraction of FieldPerp and Field2D FieldPerp operator-(const FieldPerp& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1443,6 +1321,7 @@ FieldPerp operator-(const FieldPerp& lhs, const Field2D& rhs) { // Provide the C++ operator to update FieldPerp by subtraction with Field2D FieldPerp& FieldPerp::operator-=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1469,6 +1348,7 @@ FieldPerp& FieldPerp::operator-=(const Field2D& rhs) { // Provide the C++ wrapper for multiplication of FieldPerp and FieldPerp FieldPerp operator*(const FieldPerp& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1485,6 +1365,7 @@ FieldPerp operator*(const FieldPerp& lhs, const FieldPerp& rhs) { // Provide the C++ operator to update FieldPerp by multiplication with FieldPerp FieldPerp& 
FieldPerp::operator*=(const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1505,6 +1386,7 @@ FieldPerp& FieldPerp::operator*=(const FieldPerp& rhs) { // Provide the C++ wrapper for division of FieldPerp and FieldPerp FieldPerp operator/(const FieldPerp& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1521,6 +1403,7 @@ FieldPerp operator/(const FieldPerp& lhs, const FieldPerp& rhs) { // Provide the C++ operator to update FieldPerp by division with FieldPerp FieldPerp& FieldPerp::operator/=(const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1541,6 +1424,7 @@ FieldPerp& FieldPerp::operator/=(const FieldPerp& rhs) { // Provide the C++ wrapper for addition of FieldPerp and FieldPerp FieldPerp operator+(const FieldPerp& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1557,6 +1441,7 @@ FieldPerp operator+(const FieldPerp& lhs, const FieldPerp& rhs) { // Provide the C++ operator to update FieldPerp by addition with FieldPerp FieldPerp& FieldPerp::operator+=(const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1577,6 +1462,7 @@ FieldPerp& FieldPerp::operator+=(const FieldPerp& rhs) { // Provide the C++ wrapper for subtraction of FieldPerp and FieldPerp 
FieldPerp operator-(const FieldPerp& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1593,6 +1479,7 @@ FieldPerp operator-(const FieldPerp& lhs, const FieldPerp& rhs) { // Provide the C++ operator to update FieldPerp by subtraction with FieldPerp FieldPerp& FieldPerp::operator-=(const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1613,6 +1500,7 @@ FieldPerp& FieldPerp::operator-=(const FieldPerp& rhs) { // Provide the C++ wrapper for multiplication of FieldPerp and BoutReal FieldPerp operator*(const FieldPerp& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(lhs)}; checkData(lhs); @@ -1628,6 +1516,7 @@ FieldPerp operator*(const FieldPerp& lhs, const BoutReal rhs) { // Provide the C++ operator to update FieldPerp by multiplication with BoutReal FieldPerp& FieldPerp::operator*=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1647,6 +1536,7 @@ FieldPerp& FieldPerp::operator*=(const BoutReal rhs) { // Provide the C++ wrapper for division of FieldPerp and BoutReal FieldPerp operator/(const FieldPerp& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(lhs)}; checkData(lhs); @@ -1663,6 +1553,7 @@ FieldPerp operator/(const FieldPerp& lhs, const BoutReal rhs) { // Provide the C++ operator to update FieldPerp by division with BoutReal FieldPerp& FieldPerp::operator/=(const BoutReal rhs) 
{ + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1682,6 +1573,7 @@ FieldPerp& FieldPerp::operator/=(const BoutReal rhs) { // Provide the C++ wrapper for addition of FieldPerp and BoutReal FieldPerp operator+(const FieldPerp& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(lhs)}; checkData(lhs); @@ -1697,6 +1589,7 @@ FieldPerp operator+(const FieldPerp& lhs, const BoutReal rhs) { // Provide the C++ operator to update FieldPerp by addition with BoutReal FieldPerp& FieldPerp::operator+=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1716,6 +1609,7 @@ FieldPerp& FieldPerp::operator+=(const BoutReal rhs) { // Provide the C++ wrapper for subtraction of FieldPerp and BoutReal FieldPerp operator-(const FieldPerp& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(lhs)}; checkData(lhs); @@ -1731,6 +1625,7 @@ FieldPerp operator-(const FieldPerp& lhs, const BoutReal rhs) { // Provide the C++ operator to update FieldPerp by subtraction with BoutReal FieldPerp& FieldPerp::operator-=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1750,6 +1645,7 @@ FieldPerp& FieldPerp::operator-=(const BoutReal rhs) { // Provide the C++ wrapper for multiplication of BoutReal and Field3D Field3D operator*(const BoutReal lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << 
__FILE__ << " " << std::to_string(__LINE__) << "\n"; Field3D result{emptyFrom(rhs)}; checkData(lhs); @@ -1767,6 +1663,7 @@ Field3D operator*(const BoutReal lhs, const Field3D& rhs) { // Provide the C++ wrapper for division of BoutReal and Field3D Field3D operator/(const BoutReal lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field3D result{emptyFrom(rhs)}; checkData(lhs); @@ -1784,6 +1681,7 @@ Field3D operator/(const BoutReal lhs, const Field3D& rhs) { // Provide the C++ wrapper for addition of BoutReal and Field3D Field3D operator+(const BoutReal lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field3D result{emptyFrom(rhs)}; checkData(lhs); @@ -1801,6 +1699,7 @@ Field3D operator+(const BoutReal lhs, const Field3D& rhs) { // Provide the C++ wrapper for subtraction of BoutReal and Field3D Field3D operator-(const BoutReal lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field3D result{emptyFrom(rhs)}; checkData(lhs); @@ -1818,6 +1717,7 @@ Field3D operator-(const BoutReal lhs, const Field3D& rhs) { // Provide the C++ wrapper for multiplication of BoutReal and Field2D Field2D operator*(const BoutReal lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field2D result{emptyFrom(rhs)}; checkData(lhs); @@ -1833,6 +1733,7 @@ Field2D operator*(const BoutReal lhs, const Field2D& rhs) { // Provide the C++ wrapper for division of BoutReal and Field2D Field2D operator/(const BoutReal lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field2D result{emptyFrom(rhs)}; checkData(lhs); @@ -1848,6 +1749,7 @@ Field2D operator/(const BoutReal lhs, const Field2D& rhs) { // Provide the C++ wrapper for addition of BoutReal and Field2D Field2D 
operator+(const BoutReal lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field2D result{emptyFrom(rhs)}; checkData(lhs); @@ -1863,6 +1765,7 @@ Field2D operator+(const BoutReal lhs, const Field2D& rhs) { // Provide the C++ wrapper for subtraction of BoutReal and Field2D Field2D operator-(const BoutReal lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field2D result{emptyFrom(rhs)}; checkData(lhs); @@ -1878,6 +1781,7 @@ Field2D operator-(const BoutReal lhs, const Field2D& rhs) { // Provide the C++ wrapper for multiplication of BoutReal and FieldPerp FieldPerp operator*(const BoutReal lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(rhs)}; checkData(lhs); @@ -1893,6 +1797,7 @@ FieldPerp operator*(const BoutReal lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for division of BoutReal and FieldPerp FieldPerp operator/(const BoutReal lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(rhs)}; checkData(lhs); @@ -1908,6 +1813,7 @@ FieldPerp operator/(const BoutReal lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for addition of BoutReal and FieldPerp FieldPerp operator+(const BoutReal lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(rhs)}; checkData(lhs); @@ -1923,6 +1829,7 @@ FieldPerp operator+(const BoutReal lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for subtraction of BoutReal and FieldPerp FieldPerp operator-(const BoutReal lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(rhs)}; checkData(lhs); From 
4d64ad2bbbd7bb1f717cf27b91cb85105dd6723d Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 30 May 2025 17:37:58 -0700 Subject: [PATCH 11/58] More operators - Working version --- .../elm-pb-outerloop/elm_pb_outerloop.cxx | 5 +- include/bout/assert.hxx | 1 + include/bout/bout_types.hxx | 11 ++ include/bout/field2d.hxx | 114 ++++++++++++++++-- include/bout/field3d.hxx | 53 ++++---- include/bout/fieldops.hxx | 47 ++++++-- include/bout/interpolation.hxx | 16 ++- include/bout/utils.hxx | 4 +- src/field/field2d.cxx | 3 +- src/field/generated_fieldops.cxx | 14 ++- src/field/vecops.cxx | 2 +- .../laplace/impls/naulin/naulin_laplace.cxx | 3 +- src/mesh/coordinates.cxx | 32 ++--- src/sys/derivs.cxx | 4 +- 14 files changed, 235 insertions(+), 74 deletions(-) diff --git a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx index 8e84901806..d985c3ef9d 100644 --- a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx +++ b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx @@ -1793,7 +1793,7 @@ class ELMpb : public PhysicsModel { // Calculate coefficient. 
hyper_mu_x = hyperviscos * metric->g_11 * SQ(metric->dx) - * abs(metric->g11 * D2DX2(U)) / (abs(U) + 1e-3); + * abs(Field3D{metric->g11 * D2DX2(U)}) / (abs(U) + 1e-3); hyper_mu_x.applyBoundary("dirichlet"); // Set to zero on all boundaries ddt(U) += hyper_mu_x * metric->g11 * D2DX2(U); @@ -1840,7 +1840,8 @@ class ELMpb : public PhysicsModel { ddt(U) -= 0.5 * Upara2 * bracket(Pi0, Dperp2Phi, bm_exb) / B0; Field3D B0phi = B0 * phi; mesh->communicate(B0phi); - Field3D B0phi0 = B0 * phi0; + Field2D res = B0 * phi0; + Field3D B0phi0 = res; mesh->communicate(B0phi0); ddt(U) += 0.5 * Upara2 * bracket(B0phi, Dperp2Pi0, bm_exb) / B0; ddt(U) += 0.5 * Upara2 * bracket(B0phi0, Dperp2Pi, bm_exb) / B0; diff --git a/include/bout/assert.hxx b/include/bout/assert.hxx index 653c44ed42..2909cfc3c3 100644 --- a/include/bout/assert.hxx +++ b/include/bout/assert.hxx @@ -38,6 +38,7 @@ #if CHECKLEVEL >= 1 #define ASSERT1(condition) \ if (!(condition)) { \ + abort(); \ throw BoutException("Assertion failed in {:s}, line {:d}: {:s}", __FILE__, __LINE__, \ #condition); \ } diff --git a/include/bout/bout_types.hxx b/include/bout/bout_types.hxx index c1f06fca7c..b2f38b61aa 100644 --- a/include/bout/bout_types.hxx +++ b/include/bout/bout_types.hxx @@ -140,4 +140,15 @@ struct enumWrapper { /// Boundary condition function using FuncPtr = BoutReal (*)(BoutReal t, BoutReal x, BoutReal y, BoutReal z); +template +struct Constant { + T val; + struct View { + T v; + View(T v) : v(v) {} + __device__ T operator()(int) const { return v; } + }; + operator View() const { return {val}; } +}; + #endif // BOUT_TYPES_H diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index db2ce194f8..9d9948296e 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -38,6 +38,8 @@ class Field2D; #include "bout/region.hxx" #include "bout/unused.hxx" +#include "bout/fieldops.hxx" + #if BOUT_HAS_RAJA #include "RAJA/RAJA.hpp" // using RAJA lib #endif @@ -45,6 +47,16 @@ class Field2D; class Field3D; 
class Mesh; +//template +//struct is_expr_field2d : std::false_type {}; + +//template +//inline constexpr bool is_expr_field2d_v = is_expr_field2d>::value; + +template +struct is_expr_field2d> + : std::integral_constant>::value && is_expr_field2d_v>> {}; + /*! * \brief 2D X-Y scalar fields * @@ -91,6 +103,14 @@ public: DirectionTypes directions_in = {YDirectionType::Standard, ZDirectionType::Average}); + template && is_expr_field2d_v>> + Field2D(const BinaryExpr& expr) { + Array data{expr.size()}; + expr.evaluate(&data[0]); + *this = std::move(Field2D{std::move(data), expr.getMesh(), expr.getLocation(), + expr.getDirections()}); + } /*! * Destructor */ @@ -166,6 +186,14 @@ public: */ Field2D& operator=(BoutReal rhs); + template + std::enable_if_t, Field2D&> + operator=(BinaryExpr& expr) { + std::cout << "RUNNING Field2D operator= with CUDA\n"; + expr.evaluate(&data[0]); + return *this; + } + ///////////////////////////////////////////////////////// // Data access @@ -310,30 +338,90 @@ private: // Non-member overloaded operators -Field2D operator+(const Field2D& lhs, const Field2D& rhs); -Field2D operator-(const Field2D& lhs, const Field2D& rhs); -Field2D operator*(const Field2D& lhs, const Field2D& rhs); -#if 0 -template && is_expr_field2d_v>> -BinaryExpr operator*(const L& lhs, const R& rhs) { +//Field2D operator+(const Field2D& lhs, const Field2D& rhs); +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr> +operator+(const L& lhs, const R& rhs) { + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + bout::op::Add{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + std::nullopt, + lhs.getMesh()->getRegion2D("RGN_ALL")}; +} +//Field2D operator-(const Field2D& lhs, const Field2D& rhs); +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr> +operator-(const L& lhs, const R& rhs) { + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + bout::op::Sub{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + 
std::nullopt, + lhs.getMesh()->getRegion2D("RGN_ALL")}; +} +//Field2D operator*(const Field2D& lhs, const Field2D& rhs); +#if 1 +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr> +operator*(const L& lhs, const R& rhs) { return BinaryExpr{static_cast(lhs), static_cast(rhs), bout::op::Mul{}, lhs.getMesh(), lhs.getLocation(), lhs.getDirections(), - lhs.getRegionID(), - (regionID.has_value() - ? lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; + std::nullopt, + lhs.getMesh()->getRegion2D("RGN_ALL")}; +} +#endif +//Field2D operator/(const Field2D& lhs, const Field2D& rhs); +#if 1 +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr> +operator/(const L& lhs, const R& rhs) { + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + bout::op::Div{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + std::nullopt, + lhs.getMesh()->getRegion2D("RGN_ALL")}; } #endif -Field2D operator/(const Field2D& lhs, const Field2D& rhs); Field3D operator+(const Field2D& lhs, const Field3D& rhs); Field3D operator-(const Field2D& lhs, const Field3D& rhs); -Field3D operator*(const Field2D& lhs, const Field3D& rhs); +//Field3D operator*(const Field2D& lhs, const Field3D& rhs); +template +std::enable_if_t && is_expr_field3d_v, + BinaryExpr> +operator*(const L& lhs, const R& rhs) { + //static_assert(always_false || always_false, "Hello"); + auto regionID = rhs.getRegionID(); + + //std::cout << "RUNNING Field3D * Field2D using BinaryExpr with CUDA" << "\n"; + int mesh_nz = rhs.getMesh()->LocalNz; + + return BinaryExpr{ + static_cast(lhs).setScale(1, mesh_nz), + static_cast(rhs), + bout::op::Mul{}, + rhs.getMesh(), + rhs.getLocation(), + rhs.getDirections(), + regionID, + rhs.getMesh()->getRegion("RGN_ALL")}; +} Field3D operator/(const Field2D& lhs, const Field3D& rhs); Field2D operator+(const Field2D& lhs, BoutReal rhs); diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 33e42cccbb..80956cd6f9 100644 
--- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -40,19 +40,6 @@ class Field3D; class Mesh; #include "bout/fieldops.hxx" -// Base template: nothing is an expression by default -template -struct is_expr_field3d : std::false_type {}; - -template -struct is_expr_field2d : std::false_type {}; - -// Helper variable template -template -inline constexpr bool is_expr_field3d_v = is_expr_field3d>::value; - -template -inline constexpr bool is_expr_field2d_v = is_expr_field2d>::value; /// Class for 3D X-Y-Z scalar fields /*! @@ -198,7 +185,8 @@ public: Field3D(Array data, Mesh* localmesh, CELL_LOC location = CELL_CENTRE, DirectionTypes directions_in = {YDirectionType::Standard, ZDirectionType::Standard}); - template + template || is_expr_field3d_v>> Field3D(const BinaryExpr& expr) { //std::cout << "RUNNING constructor from BinaryExpr\n"; Array data{expr.size()}; @@ -475,7 +463,8 @@ public: void operator=(const FieldPerp& rhs); Field3D& operator=(BoutReal val); template - Field3D& operator=(BinaryExpr& expr) { + std::enable_if_t, Field3D&> + operator=(BinaryExpr& expr) { std::cout << "RUNNING operator= with CUDA\n"; regionID = expr.getRegionID(); //expr.evaluate(&data[0]); @@ -811,7 +800,25 @@ Field3D operator/(const Field3D& lhs, BoutReal rhs); Field3D operator+(BoutReal lhs, const Field3D& rhs); Field3D operator-(BoutReal lhs, const Field3D& rhs); -Field3D operator*(BoutReal lhs, const Field3D& rhs); +//Field3D operator*(BoutReal lhs, const Field3D& rhs); +template +std::enable_if_t && is_expr_field3d_v, + BinaryExpr, R, bout::op::Mul>> +operator*(const L& lhs, const R& rhs) { + //static_assert(always_false || always_false, "Hello"); + auto regionID = rhs.getRegionID(); + + return BinaryExpr, R, bout::op::Mul>{ + static_cast::View>(lhs), + static_cast(rhs), + bout::op::Mul{}, + rhs.getMesh(), + rhs.getLocation(), + rhs.getDirections(), + regionID, + rhs.getMesh()->getRegion("RGN_ALL")}; +} + Field3D operator/(BoutReal lhs, const Field3D& rhs); /*! 
@@ -935,18 +942,6 @@ struct is_expr_field2d : std::true_type {}; template struct is_expr_field3d> - : std::integral_constant>::value> {}; - -template -struct is_expr_field2d> - : std::integral_constant>::value> {}; - -//template -//struct is_expr_field3d::View> : is_expr_field3d {}; - -//template -//struct is_expr_field3d< typename BinaryExpr::View > -// : std::integral_constant>::value> {}; -// //: is_expr_field3d> {}; + : std::integral_constant>::value || is_expr_field3d_v>> {}; #endif /* BOUT_FIELD3D_H */ diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index d83d1eedd1..b78dadc315 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -1,18 +1,46 @@ #pragma once -#include "bout/array.hxx" -#include #ifndef BOUT_FIELDOPS_HXX #define BOUT_FIELDOPS_HXX +#include "bout/array.hxx" #include "bout/bout_types.hxx" #include #include +#include +#include class Mesh; class Field3D; +class Field2D; + +template +struct is_expr_field2d : std::false_type {}; + +template +inline constexpr bool is_expr_field2d_v = is_expr_field2d>::value; + +// Base template: nothing is an expression by default +template +struct is_expr_field3d : std::false_type {}; + +// Helper variable template +template +inline constexpr bool is_expr_field3d_v = is_expr_field3d>::value; + +template +struct is_expr_boutreal : std::false_type {}; + +template +inline constexpr bool is_expr_boutreal_v = is_expr_boutreal>::value; + +template <> +struct is_expr_boutreal : std::true_type {}; + +template +struct is_expr_boutreal> + : std::integral_constant>> {}; -#include namespace bout { namespace op { @@ -144,6 +172,9 @@ struct BinaryExpr { //} } + BinaryExpr& operator=(BinaryExpr const&) = delete; + BinaryExpr& operator=(BinaryExpr&&) = delete; + inline int size() const { return indices.size(); } inline BoutReal operator()(int idx) const { return f(idx, lhs, rhs); // single‐pass fusion @@ -156,11 +187,13 @@ struct BinaryExpr { const int* indices; int num_indices; Func f; - int 
scale = 1; + int mul = 1; + int div = 1; int offset = 0; - View& setScale(int s) { - scale = s; + View& setScale(int mul, int div) { + this->mul = mul; + this->div = div; return *this; } View& setOffset(int o) { @@ -171,7 +204,7 @@ struct BinaryExpr { __device__ __forceinline__ int size() const { return num_indices; } __device__ __forceinline__ int regionIdx(int idx) const { return indices[idx]; } __device__ __forceinline__ BoutReal operator()(int idx) const { - return f(idx, lhs, rhs); // single‐pass fusion + return f((idx * mul) / div, lhs, rhs); // single‐pass fusion //return f(lhs(idx), rhs(idx)); // single‐pass fusion } }; diff --git a/include/bout/interpolation.hxx b/include/bout/interpolation.hxx index 2c7df4472d..85c04cf897 100644 --- a/include/bout/interpolation.hxx +++ b/include/bout/interpolation.hxx @@ -55,7 +55,8 @@ inline BoutReal interp(const stencil& s) { @param[in] region Region where output will be calculated */ template -const T interp_to(const T& var, CELL_LOC loc, const std::string region = "RGN_ALL") { +std::enable_if_t || bout::utils::is_Field3D_v, const T> +interp_to(const T& var, CELL_LOC loc, const std::string region = "RGN_ALL") { AUTO_TRACE(); static_assert(bout::utils::is_Field2D_v || bout::utils::is_Field3D_v, "interp_to must be templated with one of Field2D or Field3D."); @@ -203,4 +204,17 @@ const T interp_to(const T& var, CELL_LOC loc, const std::string region = "RGN_AL return result; } + +template +std::enable_if_t && !bout::utils::is_Field3D_v, const Field3D> +interp_to(const E &expr, CELL_LOC loc, const std::string rgn = "RGN_ALL") { + return interp_to( Field3D{expr}, loc, std::move(rgn) ); +} + +template +std::enable_if_t && !bout::utils::is_Field2D_v, const Field2D> +interp_to(const E &expr, CELL_LOC loc, const std::string rgn = "RGN_ALL") { + return interp_to( Field2D{expr}, loc, std::move(rgn) ); +} + #endif // BOUT_INTERP_H diff --git a/include/bout/utils.hxx b/include/bout/utils.hxx index e2ac814e53..42aa761886 100644 --- 
a/include/bout/utils.hxx +++ b/include/bout/utils.hxx @@ -422,12 +422,12 @@ inline BoutReal randomu() { * i.e. t * t */ template -inline T SQ(const T& t) { +inline auto SQ(const T& t) { return t * t; } template <> -BOUT_HOST_DEVICE inline BoutReal SQ(const BoutReal& t) { +BOUT_HOST_DEVICE inline auto SQ(const BoutReal& t) { return t * t; } diff --git a/src/field/field2d.cxx b/src/field/field2d.cxx index c8b9ebb689..e5c1d466b7 100644 --- a/src/field/field2d.cxx +++ b/src/field/field2d.cxx @@ -389,7 +389,8 @@ bool operator==(const Field2D& a, const Field2D& b) { if (!a.isAllocated() || !b.isAllocated()) { return false; } - return min(abs(a - b)) < 1e-10; + Field2D diff = a - b; + return min(abs(diff)) < 1e-10; } std::ostream& operator<<(std::ostream& out, const Field2D& value) { diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index 691e450b0c..439320b0ae 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -62,7 +62,7 @@ Field3D& Field3D::operator*=(const Field2D& rhs) { return *this; } -#if 1 +#if 0 // Provide the C++ wrapper for division of Field3D and Field2D Field3D operator/(const Field3D& lhs, const Field2D& rhs) { std::cout << "RUNNING operator "<< __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -485,6 +485,7 @@ Field3D& Field3D::operator-=(const BoutReal rhs) { } // Provide the C++ wrapper for multiplication of Field2D and Field3D +#if 0 Field3D operator*(const Field2D& lhs, const Field3D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -507,6 +508,7 @@ Field3D operator*(const Field2D& lhs, const Field3D& rhs) { checkData(result); return result; } +#endif // Provide the C++ wrapper for division of Field2D and Field3D Field3D operator/(const Field2D& lhs, const Field3D& rhs) { @@ -581,6 +583,7 @@ Field3D operator-(const Field2D& lhs, const Field3D& rhs) { } // Provide the C++ wrapper for 
multiplication of Field2D and Field2D +#if 0 Field2D operator*(const Field2D& lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -596,6 +599,7 @@ Field2D operator*(const Field2D& lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field2D by multiplication with Field2D Field2D& Field2D::operator*=(const Field2D& rhs) { @@ -618,6 +622,7 @@ Field2D& Field2D::operator*=(const Field2D& rhs) { return *this; } +#if 0 // Provide the C++ wrapper for division of Field2D and Field2D Field2D operator/(const Field2D& lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -634,6 +639,7 @@ Field2D operator/(const Field2D& lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field2D by division with Field2D Field2D& Field2D::operator/=(const Field2D& rhs) { @@ -657,6 +663,7 @@ Field2D& Field2D::operator/=(const Field2D& rhs) { } // Provide the C++ wrapper for addition of Field2D and Field2D +#if 0 Field2D operator+(const Field2D& lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -672,6 +679,7 @@ Field2D operator+(const Field2D& lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field2D by addition with Field2D Field2D& Field2D::operator+=(const Field2D& rhs) { @@ -695,6 +703,7 @@ Field2D& Field2D::operator+=(const Field2D& rhs) { } // Provide the C++ wrapper for subtraction of Field2D and Field2D +#if 0 Field2D operator-(const Field2D& lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -710,6 +719,7 @@ Field2D operator-(const Field2D& lhs, 
const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field2D by subtraction with Field2D Field2D& Field2D::operator-=(const Field2D& rhs) { @@ -1643,6 +1653,7 @@ FieldPerp& FieldPerp::operator-=(const BoutReal rhs) { return *this; } +#if 0 // Provide the C++ wrapper for multiplication of BoutReal and Field3D Field3D operator*(const BoutReal lhs, const Field3D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -1660,6 +1671,7 @@ Field3D operator*(const BoutReal lhs, const Field3D& rhs) { checkData(result); return result; } +#endif // Provide the C++ wrapper for division of BoutReal and Field3D Field3D operator/(const BoutReal lhs, const Field3D& rhs) { diff --git a/src/field/vecops.cxx b/src/field/vecops.cxx index 5f34e2af02..95409963f6 100644 --- a/src/field/vecops.cxx +++ b/src/field/vecops.cxx @@ -187,7 +187,7 @@ Field3D Div(const Vector3D& v, CELL_LOC outloc, const std::string& method) { Vector3D vcn = v; vcn.toContravariant(); - auto vcnJy = vcn.y.getCoordinates()->J * vcn.y; + Field3D vcnJy = vcn.y.getCoordinates()->J * vcn.y; if (v.y.hasParallelSlices()) { // If v.y has parallel slices then we are using ShiftedMetric (with // mesh:calcParallelSlices_on_communicate=true) or FCI, so we should calculate diff --git a/src/invert/laplace/impls/naulin/naulin_laplace.cxx b/src/invert/laplace/impls/naulin/naulin_laplace.cxx index e6f68d850d..203b0c0abd 100644 --- a/src/invert/laplace/impls/naulin/naulin_laplace.cxx +++ b/src/invert/laplace/impls/naulin/naulin_laplace.cxx @@ -269,8 +269,9 @@ Field3D LaplaceNaulin::solve(const Field3D& rhs, const Field3D& x0) { delp2solver->setCoefC2(C2coef_DC); // Use this below to normalize error for relative error estimate + Field3D SQField = SQ(rhsOverD); BoutReal RMS_rhsOverD = sqrt(mean( - SQ(rhsOverD), true, + SQField, true, "RGN_NOBNDRY")); // use sqrt(mean(SQ)) to make sure we do not divide by zero at a point BoutReal error_rel = 
1e20, error_abs = 1e20, last_error = error_abs; diff --git a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx index 8123720144..9861fe58bf 100644 --- a/src/mesh/coordinates.cxx +++ b/src/mesh/coordinates.cxx @@ -553,7 +553,8 @@ Coordinates::Coordinates(Mesh* mesh, Options* options) transform.get()); // Compare calculated and loaded values - output_warn.write("\tMaximum difference in J is {:e}\n", max(abs(J - Jcalc))); + Field2D diff = J - Jcalc; + output_warn.write("\tMaximum difference in J is {:e}\n", max(abs(diff))); communicate(J); @@ -578,7 +579,8 @@ Coordinates::Coordinates(Mesh* mesh, Options* options) Bxy = interpolateAndExtrapolate(Bxy, location, extrapolate_x, extrapolate_y, false, transform.get()); - output_warn.write("\tMaximum difference in Bxy is {:e}\n", max(abs(Bxy - Bcalc))); + FieldMetric diff = Bxy - Bcalc; + output_warn.write("\tMaximum difference in Bxy is {:e}\n", max(abs(diff))); } // Check Bxy @@ -759,8 +761,9 @@ Coordinates::Coordinates(Mesh* mesh, Options* options, const CELL_LOC loc, J = interpolateAndExtrapolate(J, location, extrapolate_x, extrapolate_y, false, transform.get()); + FieldMetric diff = J - Jcalc; // Compare calculated and loaded values - output_warn.write("\tMaximum difference in J is %e\n", max(abs(J - Jcalc))); + output_warn.write("\tMaximum difference in J is %e\n", max(abs(diff))); // Re-evaluate Bxy using new J Bxy = sqrt(g_22) / J; @@ -785,7 +788,8 @@ Coordinates::Coordinates(Mesh* mesh, Options* options, const CELL_LOC loc, Bxy = interpolateAndExtrapolate(Bxy, location, extrapolate_x, extrapolate_y, false, transform.get()); - output_warn.write("\tMaximum difference in Bxy is %e\n", max(abs(Bxy - Bcalc))); + FieldMetric diff = Bxy - Bcalc; + output_warn.write("\tMaximum difference in Bxy is %e\n", max(abs(diff))); } // Check Bxy @@ -1029,7 +1033,7 @@ int Coordinates::geometry(bool recalculate_staggered, G3_23 = 0.5 * g13 * (DDZ(g_12) + DDY(g_13) - DDX(g_23)) + 0.5 * g23 * DDZ(g_22) + 0.5 * g33 * DDY(g_33); - auto 
tmp = J * g12; + FieldMetric tmp = J * g12; communicate(tmp); G1 = (DDX(J * g11) + DDY(tmp) + DDZ(J * g13)) / J; tmp = J * g22; @@ -1268,9 +1272,9 @@ int Coordinates::calcCovariant(const std::string& region) { output_info.write("\tLocal maximum error in diagonal inversion is {:e}\n", maxerr); - maxerr = BOUTMAX(max(abs(g_11 * g12 + g_12 * g22 + g_13 * g23)), - max(abs(g_11 * g13 + g_12 * g23 + g_13 * g33)), - max(abs(g_12 * g13 + g_22 * g23 + g_23 * g33))); + maxerr = BOUTMAX(max(abs(FieldMetric{g_11 * g12 + g_12 * g22 + g_13 * g23})), + max(abs(FieldMetric{g_11 * g13 + g_12 * g23 + g_13 * g33})), + max(abs(FieldMetric{g_12 * g13 + g_22 * g23 + g_23 * g33}))); output_info.write("\tLocal maximum error in off-diagonal inversion is {:e}\n", maxerr); @@ -1324,9 +1328,9 @@ int Coordinates::calcContravariant(const std::string& region) { output_info.write("\tMaximum error in diagonal inversion is {:e}\n", maxerr); - maxerr = BOUTMAX(max(abs(g_11 * g12 + g_12 * g22 + g_13 * g23)), - max(abs(g_11 * g13 + g_12 * g23 + g_13 * g33)), - max(abs(g_12 * g13 + g_22 * g23 + g_23 * g33))); + maxerr = BOUTMAX(max(abs(FieldMetric{g_11 * g12 + g_12 * g22 + g_13 * g23})), + max(abs(FieldMetric{g_11 * g13 + g_12 * g23 + g_13 * g33})), + max(abs(FieldMetric{g_12 * g13 + g_22 * g23 + g_23 * g33}))); output_info.write("\tMaximum error in off-diagonal inversion is {:e}\n", maxerr); return 0; @@ -1339,13 +1343,13 @@ int Coordinates::jacobian() { const bool extrapolate_x = not localmesh->sourceHasXBoundaryGuards(); const bool extrapolate_y = not localmesh->sourceHasYBoundaryGuards(); - auto g = g11 * g22 * g33 + 2.0 * g12 * g13 * g23 - g11 * g23 * g23 - g22 * g13 * g13 - - g33 * g12 * g12; + auto g = FieldMetric{g11 * g22 * g33 + 2.0 * g12 * g13 * g23 - g11 * g23 * g23 + - g22 * g13 * g13 - g33 * g12 * g12}; // Check that g is positive bout::checkPositive(g, "The determinant of g^ij", "RGN_NOBNDRY"); - J = 1. / sqrt(g); + J = 1. 
/ sqrt(Field2D{g}); // More robust to extrapolate derived quantities directly, rather than // deriving from extrapolated covariant metric components J = interpolateAndExtrapolate(J, location, extrapolate_x, extrapolate_y, false, diff --git a/src/sys/derivs.cxx b/src/sys/derivs.cxx index ee9bcbcc2c..f12c517f82 100644 --- a/src/sys/derivs.cxx +++ b/src/sys/derivs.cxx @@ -173,7 +173,7 @@ Coordinates::FieldMetric D2DX2(const Field2D& f, CELL_LOC outloc, const std::string& method, const std::string& region) { Coordinates* coords = f.getCoordinates(outloc); - auto result = + Field2D result = bout::derivatives::index::D2DX2(f, outloc, method, region) / SQ(coords->dx); if (coords->non_uniform) { @@ -210,7 +210,7 @@ Coordinates::FieldMetric D2DY2(const Field2D& f, CELL_LOC outloc, const std::string& method, const std::string& region) { Coordinates* coords = f.getCoordinates(outloc); - auto result = + Field2D result = bout::derivatives::index::D2DY2(f, outloc, method, region) / SQ(coords->dy); if (coords->non_uniform) { // Correction for non-uniform f.getMesh() From 5dfa66a75cba94f47aade6c6c63ca29afc0eb099 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Sat, 31 May 2025 08:13:20 -0700 Subject: [PATCH 12/58] Add more operators --- .../elm-pb-outerloop/elm_pb_outerloop.cxx | 4 +- include/bout/assert.hxx | 2 +- include/bout/field2d.hxx | 80 +++++++++++++++---- include/bout/field3d.hxx | 16 ++-- include/bout/fieldops.hxx | 49 +++--------- src/field/generated_fieldops.cxx | 6 ++ src/mesh/coordinates.cxx | 12 +-- 7 files changed, 102 insertions(+), 67 deletions(-) diff --git a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx index d985c3ef9d..ec2ab8e2a7 100644 --- a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx +++ b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx @@ -713,7 +713,7 @@ class ELMpb : public PhysicsModel { diamag_phi0 = false; K_H_term = false; } else { - Dphi0 = -D_min - 0.5 * D_0 * (1.0 - tanh(D_s * (x - 
x0))); + Dphi0 = -D_min - 0.5 * D_0 * (1.0 - tanh(Field2D{D_s * (x - x0)})); } if (sign < 0) { // change flow direction @@ -1213,7 +1213,7 @@ class ELMpb : public PhysicsModel { // Only if not restarting: Check initial perturbation // Set U to zero where P0 < vacuum_pressure - U = where(P0 - vacuum_pressure, U, 0.0); + U = where(Field2D{P0 - vacuum_pressure}, U, 0.0); if (constn0) { ubyn = U; diff --git a/include/bout/assert.hxx b/include/bout/assert.hxx index 2909cfc3c3..954ae8dba0 100644 --- a/include/bout/assert.hxx +++ b/include/bout/assert.hxx @@ -38,9 +38,9 @@ #if CHECKLEVEL >= 1 #define ASSERT1(condition) \ if (!(condition)) { \ - abort(); \ throw BoutException("Assertion failed in {:s}, line {:d}: {:s}", __FILE__, __LINE__, \ #condition); \ + abort(); \ } #else // CHECKLEVEL >= 1 #define ASSERT1(condition) diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index 9d9948296e..baff508331 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -47,15 +47,14 @@ class Field2D; class Field3D; class Mesh; -//template -//struct is_expr_field2d : std::false_type {}; - -//template -//inline constexpr bool is_expr_field2d_v = is_expr_field2d>::value; - template struct is_expr_field2d> - : std::integral_constant>::value && is_expr_field2d_v>> {}; + : std::integral_constant> + && is_expr_field2d_v>) + || (is_expr_constant_v> + && is_expr_field2d_v>) + || (is_expr_field2d_v> + && is_expr_constant_v>)> {}; /*! 
* \brief 2D X-Y scalar fields @@ -103,8 +102,11 @@ public: DirectionTypes directions_in = {YDirectionType::Standard, ZDirectionType::Average}); - template && is_expr_field2d_v>> + template < + typename L, typename R, typename Func, + typename = std::enable_if_t<(is_expr_field2d_v && is_expr_field2d_v) + || (is_expr_constant_v && is_expr_field2d_v) + || (is_expr_field2d_v && is_expr_constant_v)>> Field2D(const BinaryExpr& expr) { Array data{expr.size()}; expr.evaluate(&data[0]); @@ -188,9 +190,13 @@ public: template std::enable_if_t, Field2D&> - operator=(BinaryExpr& expr) { + operator=(const BinaryExpr& expr) { std::cout << "RUNNING Field2D operator= with CUDA\n"; - expr.evaluate(&data[0]); + if (isAllocated()) { + expr.evaluate(&data[0]); + } else { + *this = Field2D{expr}; + } return *this; } @@ -278,7 +284,19 @@ public: /// In-place division. Copy-on-write used if data is shared Field2D& operator/=(const Field2D& rhs); /// In-place division. Copy-on-write used if data is shared - Field2D& operator/=(BoutReal rhs); + //Field2D& operator/=(BoutReal rhs); + template >> + Field2D& operator/=(R rhs) { + //printf("RUNNING operator+= with CUDA\n"); + if (data.unique()) { + auto BE = (*this) / rhs; + BE.evaluate(&data[0]); + } else { + (*this) = (*this) / rhs; + } + + return *this; + } // FieldData virtual functions @@ -425,13 +443,47 @@ operator*(const L& lhs, const R& rhs) { Field3D operator/(const Field2D& lhs, const Field3D& rhs); Field2D operator+(const Field2D& lhs, BoutReal rhs); -Field2D operator-(const Field2D& lhs, BoutReal rhs); +//Field2D operator-(const Field2D& lhs, BoutReal rhs); +#if 1 +template +std::enable_if_t && is_expr_constant_v, + BinaryExpr, bout::op::Sub>> +operator-(const L& lhs, R rhs) { + return BinaryExpr, bout::op::Sub>{ + static_cast(lhs), + static_cast::View>(rhs), + bout::op::Sub{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + std::nullopt, + lhs.getMesh()->getRegion2D("RGN_ALL")}; +} +#endif Field2D operator*(const 
Field2D& lhs, BoutReal rhs); Field2D operator/(const Field2D& lhs, BoutReal rhs); Field2D operator+(BoutReal lhs, const Field2D& rhs); Field2D operator-(BoutReal lhs, const Field2D& rhs); -Field2D operator*(BoutReal lhs, const Field2D& rhs); +//Field2D operator*(BoutReal lhs, const Field2D& rhs); +#if 1 +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr, R, bout::op::Mul>> +operator*(L lhs, const R& rhs) { + //static_assert(always_false || always_false, "Hello"); + + return BinaryExpr, R, bout::op::Mul>{ + static_cast::View>(lhs), + static_cast(rhs), + bout::op::Mul{}, + rhs.getMesh(), + rhs.getLocation(), + rhs.getDirections(), + std::nullopt, + rhs.getMesh()->getRegion2D("RGN_ALL")}; +} +#endif Field2D operator/(BoutReal lhs, const Field2D& rhs); /*! diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 80956cd6f9..c11bab0599 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -467,8 +467,12 @@ public: operator=(BinaryExpr& expr) { std::cout << "RUNNING operator= with CUDA\n"; regionID = expr.getRegionID(); - //expr.evaluate(&data[0]); - expr.evaluateWithResult(static_cast(*this)); + if(isAllocated()) { + expr.evaluate(&data[0]); + } + else { + *this = Field3D{expr}; + } return *this; } @@ -481,15 +485,15 @@ public: Field3D& operator+=(const R& rhs) { //printf("RUNNING operator+= with CUDA\n"); if (data.unique()) { - printf("RUNNING operator+= with CUDA with evaluateWithResult\n"); + //std::cout << "RUNNING Field3D operator+= w/ CUDA" << __FILE__ << " " + // << std::to_string(__LINE__) << "\n"; // Delete existing parallel slices. We don't copy parallel slices, so any // that currently exist will be incorrect. 
clearParallelSlices(); auto BE = (*this) + rhs; regionID = BE.getRegionID(); - //BE.evaluate(&data[0]); - BE.evaluateWithResult(static_cast(*this)); + BE.evaluate(&data[0]); } else { (*this) = (*this) + rhs; } @@ -802,7 +806,7 @@ Field3D operator+(BoutReal lhs, const Field3D& rhs); Field3D operator-(BoutReal lhs, const Field3D& rhs); //Field3D operator*(BoutReal lhs, const Field3D& rhs); template -std::enable_if_t && is_expr_field3d_v, +std::enable_if_t && is_expr_field3d_v, BinaryExpr, R, bout::op::Mul>> operator*(const L& lhs, const R& rhs) { //static_assert(always_false || always_false, "Hello"); diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index b78dadc315..6ec7947be4 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -29,18 +29,20 @@ template inline constexpr bool is_expr_field3d_v = is_expr_field3d>::value; template -struct is_expr_boutreal : std::false_type {}; +struct is_expr_constant : std::bool_constant> {}; template -inline constexpr bool is_expr_boutreal_v = is_expr_boutreal>::value; - -template <> -struct is_expr_boutreal : std::true_type {}; +inline constexpr bool is_expr_constant_v = is_expr_constant>::value; template -struct is_expr_boutreal> - : std::integral_constant>> {}; +struct is_expr_constant> + : std::integral_constant>> {}; +// After the specialization… +static_assert(is_expr_constant_v> == true, + "Constant should be recognized as an expr_constant!"); +static_assert(is_expr_constant_v> == true, + "Constant should be recognized as an expr_constant!"); namespace bout { namespace op { @@ -101,7 +103,7 @@ __global__ __launch_bounds__(256) static void evaluatorExpr(BoutReal* out, const Expr expr) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= expr.size()) { - return; // Out of bounds + return; } int idx = expr.regionIdx(tid); out[idx] = expr(idx); // single‐pass fusion @@ -112,24 +114,6 @@ __global__ __launch_bounds__(256) static void evaluatorExpr(BoutReal* out, //} } -template 
-__global__ __launch_bounds__(256) static void evaluatorExprWithResult(Result res, - const Expr expr) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid >= expr.size()) { - return; // Out of bounds - } - int idx = expr.regionIdx(tid); - res[idx] = expr(idx); // single‐pass fusion - //res(idx, expr(idx)); // single‐pass fusion - //res(idx) = expr(idx); // single‐pass fusion - //int stride = blockDim.x * gridDim.x; - //for (int i = tid, e = expr.size(); i < e; i += stride) { - // int idx = expr.regionIdx(i); - // out[idx] = expr(idx); // single‐pass fusion - //} -} - inline std::unordered_map> regionIndicesCache; template @@ -222,21 +206,10 @@ struct BinaryExpr { //} } - template - void evaluateWithResult(const Result& res) const { - constexpr int THREADS = 256; - int blocks = (size() + THREADS - 1) / THREADS; - evaluatorExprWithResult<<>>(res, static_cast(*this)); - cudaDeviceSynchronize(); - //for(int i=0; i getRegionID() const { return regionID; }; }; -#endif // BOUT_EXPRESSION_HXX \ No newline at end of file +#endif // BOUT_FIELDOPS_HXX \ No newline at end of file diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index 439320b0ae..9ae40ba41b 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -879,6 +879,7 @@ Field2D operator/(const Field2D& lhs, const BoutReal rhs) { return result; } +#if 0 // Provide the C++ operator to update Field2D by division with BoutReal Field2D& Field2D::operator/=(const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -899,6 +900,7 @@ Field2D& Field2D::operator/=(const BoutReal rhs) { } return *this; } +#endif // Provide the C++ wrapper for addition of Field2D and BoutReal Field2D operator+(const Field2D& lhs, const BoutReal rhs) { @@ -937,6 +939,7 @@ Field2D& Field2D::operator+=(const BoutReal rhs) { } // Provide the C++ wrapper for subtraction of Field2D and BoutReal +#if 0 Field2D operator-(const
Field2D& lhs, const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -951,6 +954,7 @@ Field2D operator-(const Field2D& lhs, const BoutReal rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field2D by subtraction with BoutReal Field2D& Field2D::operator-=(const BoutReal rhs) { @@ -1727,6 +1731,7 @@ Field3D operator-(const BoutReal lhs, const Field3D& rhs) { return result; } +#if 0 // Provide the C++ wrapper for multiplication of BoutReal and Field2D Field2D operator*(const BoutReal lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -1742,6 +1747,7 @@ Field2D operator*(const BoutReal lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ wrapper for division of BoutReal and Field2D Field2D operator/(const BoutReal lhs, const Field2D& rhs) { diff --git a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx index 9861fe58bf..8bded7fee5 100644 --- a/src/mesh/coordinates.cxx +++ b/src/mesh/coordinates.cxx @@ -1266,9 +1266,9 @@ int Coordinates::calcCovariant(const std::string& region) { } BoutReal maxerr; - maxerr = BOUTMAX(max(abs((g_11 * g11 + g_12 * g12 + g_13 * g13) - 1)), - max(abs((g_12 * g12 + g_22 * g22 + g_23 * g23) - 1)), - max(abs((g_13 * g13 + g_23 * g23 + g_33 * g33) - 1))); + maxerr = BOUTMAX(max(abs(FieldMetric{(g_11 * g11 + g_12 * g12 + g_13 * g13) - 1})), + max(abs(FieldMetric{(g_12 * g12 + g_22 * g22 + g_23 * g23) - 1})), + max(abs(FieldMetric{(g_13 * g13 + g_23 * g23 + g_33 * g33) - 1}))); output_info.write("\tLocal maximum error in diagonal inversion is {:e}\n", maxerr); @@ -1322,9 +1322,9 @@ int Coordinates::calcContravariant(const std::string& region) { } BoutReal maxerr; - maxerr = BOUTMAX(max(abs((g_11 * g11 + g_12 * g12 + g_13 * g13) - 1)), - max(abs((g_12 * g12 + g_22 * g22 + g_23 * g23) - 1)), - max(abs((g_13 * g13 + g_23 * g23 + g_33 * g33) - 1))); 
+ maxerr = BOUTMAX(max(abs(FieldMetric{(g_11 * g11 + g_12 * g12 + g_13 * g13) - 1})), + max(abs(FieldMetric{(g_12 * g12 + g_22 * g22 + g_23 * g23) - 1})), + max(abs(FieldMetric{(g_13 * g13 + g_23 * g23 + g_33 * g33) - 1}))); output_info.write("\tMaximum error in diagonal inversion is {:e}\n", maxerr); From 026645a47a43e7f9f0cb748dac893313c2a3ba2f Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Sat, 31 May 2025 18:30:17 -0700 Subject: [PATCH 13/58] More operators macro for dedup definition per operator --- .../elm-pb-outerloop/elm_pb_outerloop.cxx | 3 +- include/bout/field2d.hxx | 127 ++++++++++++++++-- include/bout/field3d.hxx | 80 ++++++++++- include/bout/utils.hxx | 5 +- src/field/generated_fieldops.cxx | 24 ++++ src/mesh/coordinates.cxx | 4 +- src/physics/snb.cxx | 2 +- 7 files changed, 221 insertions(+), 24 deletions(-) diff --git a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx index ec2ab8e2a7..7d38780814 100644 --- a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx +++ b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx @@ -1031,7 +1031,8 @@ class ELMpb : public PhysicsModel { vacuum_trans *= pnorm; // Transitions from 0 in core to 1 in vacuum - vac_mask = (1.0 - tanh((P0 - vacuum_pressure) / vacuum_trans)) / 2.0; + Field2D tanh_res = tanh(Field2D{(P0 - vacuum_pressure) / vacuum_trans}); + vac_mask = (1.0 - tanh_res) / 2.0; if (spitzer_resist) { // Use Spitzer resistivity diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index baff508331..97946ee713 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -270,17 +270,66 @@ public: } /// In-place addition. 
Copy-on-write used if data is shared - Field2D& operator+=(const Field2D& rhs); + //Field2D& operator+=(const Field2D& rhs); + template >> + Field2D& operator+=(const R& rhs) { + //printf("RUNNING operator+= with CUDA\n"); + if (data.unique()) { + auto BE = (*this) + rhs; + BE.evaluate(&data[0]); + } else { + (*this) = (*this) + rhs; + } + + return *this; + } /// In-place addition. Copy-on-write used if data is shared Field2D& operator+=(BoutReal rhs); /// In-place subtraction. Copy-on-write used if data is shared - Field2D& operator-=(const Field2D& rhs); + //Field2D& operator-=(const Field2D& rhs); + //here1 + template >> + Field2D& operator-=(const R& rhs) { + //printf("RUNNING operator-= with CUDA\n"); + if (data.unique()) { + auto BE = (*this) - rhs; + BE.evaluate(&data[0]); + } else { + (*this) = (*this) - rhs; + } + + return *this; + } /// In-place subtraction. Copy-on-write used if data is shared Field2D& operator-=(BoutReal rhs); /// In-place multiplication. Copy-on-write used if data is shared - Field2D& operator*=(const Field2D& rhs); + //Field2D& operator*=(const Field2D& rhs); + template >> + Field2D& operator*=(const R& rhs) { + //printf("RUNNING operator*= with CUDA\n"); + if (data.unique()) { + auto BE = (*this) * rhs; + BE.evaluate(&data[0]); + } else { + (*this) = (*this) * rhs; + } + + return *this; + } /// In-place multiplication. Copy-on-write used if data is shared - Field2D& operator*=(BoutReal rhs); + //Field2D& operator*=(BoutReal rhs); + template >> + Field2D& operator*=(R rhs) { + //printf("RUNNING operator*= with CUDA\n"); + if (data.unique()) { + auto BE = (*this) * rhs; + BE.evaluate(&data[0]); + } else { + (*this) = (*this) * rhs; + } + + return *this; + } /// In-place division. Copy-on-write used if data is shared Field2D& operator/=(const Field2D& rhs); /// In-place division.
Copy-on-write used if data is shared @@ -444,7 +493,6 @@ Field3D operator/(const Field2D& lhs, const Field3D& rhs); Field2D operator+(const Field2D& lhs, BoutReal rhs); //Field2D operator-(const Field2D& lhs, BoutReal rhs); -#if 1 template std::enable_if_t && is_expr_constant_v, BinaryExpr, bout::op::Sub>> @@ -459,14 +507,56 @@ operator-(const L& lhs, R rhs) { std::nullopt, lhs.getMesh()->getRegion2D("RGN_ALL")}; } -#endif -Field2D operator*(const Field2D& lhs, BoutReal rhs); -Field2D operator/(const Field2D& lhs, BoutReal rhs); +//Field2D operator*(const Field2D& lhs, BoutReal rhs); +template +std::enable_if_t && is_expr_constant_v, + BinaryExpr, bout::op::Mul>> +operator*(const L& lhs, R rhs) { + return BinaryExpr, bout::op::Mul>{ + static_cast(lhs), + static_cast::View>(rhs), + bout::op::Mul{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + std::nullopt, + lhs.getMesh()->getRegion2D("RGN_ALL")}; +} +//Field2D operator/(const Field2D& lhs, BoutReal rhs); +template +std::enable_if_t && is_expr_constant_v, + BinaryExpr, bout::op::Div>> +operator/(const L& lhs, R rhs) { + return BinaryExpr, bout::op::Div>{ + static_cast(lhs), + static_cast::View>(rhs), + bout::op::Div{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + std::nullopt, + lhs.getMesh()->getRegion2D("RGN_ALL")}; +} Field2D operator+(BoutReal lhs, const Field2D& rhs); -Field2D operator-(BoutReal lhs, const Field2D& rhs); +//Field2D operator-(BoutReal lhs, const Field2D& rhs); +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr, R, bout::op::Sub>> +operator-(L lhs, const R& rhs) { + //static_assert(always_false || always_false, "Hello"); + + return BinaryExpr, R, bout::op::Sub>{ + static_cast::View>(lhs), + static_cast(rhs), + bout::op::Sub{}, + rhs.getMesh(), + rhs.getLocation(), + rhs.getDirections(), + std::nullopt, + rhs.getMesh()->getRegion2D("RGN_ALL")}; +} //Field2D operator*(BoutReal lhs, const Field2D& rhs); -#if 1 template std::enable_if_t && 
is_expr_field2d_v, BinaryExpr, R, bout::op::Mul>> @@ -483,8 +573,21 @@ operator*(L lhs, const R& rhs) { std::nullopt, rhs.getMesh()->getRegion2D("RGN_ALL")}; } -#endif -Field2D operator/(BoutReal lhs, const Field2D& rhs); +//Field2D operator/(BoutReal lhs, const Field2D& rhs); +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr, R, bout::op::Div>> +operator/(L lhs, const R& rhs) { + return BinaryExpr, R, bout::op::Div>{ + static_cast::View>(lhs), + static_cast(rhs), + bout::op::Div{}, + rhs.getMesh(), + rhs.getLocation(), + rhs.getDirections(), + std::nullopt, + rhs.getMesh()->getRegion2D("RGN_ALL")}; +} /*! * Unary minus. Returns the negative of given field, diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index c11bab0599..cab156db79 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -548,7 +548,25 @@ public: return *this; } Field3D& operator*=(const Field2D& rhs); - Field3D& operator*=(BoutReal rhs); + //Field3D& operator*=(BoutReal rhs); + // here1 + template >> + Field3D& operator*=(R rhs) { + //printf("RUNNING operator*= with CUDA\n"); + if (data.unique()) { + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. 
+ clearParallelSlices(); + + auto BE = (*this) * rhs; + regionID = BE.getRegionID(); + BE.evaluate(&data[0]); + } else { + (*this) = (*this) * rhs; + } + + return *this; + } ///@} /// Division operators @@ -799,17 +817,51 @@ operator/(const L& lhs, const R& rhs) { Field3D operator+(const Field3D& lhs, BoutReal rhs); Field3D operator-(const Field3D& lhs, BoutReal rhs); -Field3D operator*(const Field3D& lhs, BoutReal rhs); +//Field3D operator*(const Field3D& lhs, BoutReal rhs); +//here2 +template +std::enable_if_t && is_expr_constant_v, + BinaryExpr, bout::op::Mul>> +operator*(const L& lhs, R rhs) { + //static_assert(always_false || always_false, "Hello"); + auto regionID = lhs.getRegionID(); + + return BinaryExpr, bout::op::Mul>{ + static_cast(lhs), + static_cast::View>(rhs), + bout::op::Mul{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + lhs.getMesh()->getRegion("RGN_ALL")}; +} Field3D operator/(const Field3D& lhs, BoutReal rhs); -Field3D operator+(BoutReal lhs, const Field3D& rhs); +//Field3D operator+(BoutReal lhs, const Field3D& rhs); +template +std::enable_if_t && is_expr_field3d_v, + BinaryExpr, R, bout::op::Add>> +operator+(const L& lhs, const R& rhs) { + auto regionID = rhs.getRegionID(); + + return BinaryExpr, R, bout::op::Add>{ + static_cast::View>(lhs), + static_cast(rhs), + bout::op::Add{}, + rhs.getMesh(), + rhs.getLocation(), + rhs.getDirections(), + regionID, + rhs.getMesh()->getRegion("RGN_ALL")}; +} Field3D operator-(BoutReal lhs, const Field3D& rhs); +#if 0 //Field3D operator*(BoutReal lhs, const Field3D& rhs); template std::enable_if_t && is_expr_field3d_v, BinaryExpr, R, bout::op::Mul>> operator*(const L& lhs, const R& rhs) { - //static_assert(always_false || always_false, "Hello"); auto regionID = rhs.getRegionID(); return BinaryExpr, R, bout::op::Mul>{ @@ -822,9 +874,29 @@ operator*(const L& lhs, const R& rhs) { regionID, rhs.getMesh()->getRegion("RGN_ALL")}; } +#endif Field3D operator/(BoutReal lhs, const Field3D& 
rhs); +#define FIELD3D_BOUTREAL_OP(OP_SYM, OP_KIND) \ + template \ + std::enable_if_t && is_expr_field3d_v, \ + BinaryExpr, R, bout::op::OP_KIND>> \ + operator OP_SYM(const L & lhs, const R & rhs) { \ + auto regionID = rhs.getRegionID(); \ + return BinaryExpr, R, bout::op::OP_KIND>{ \ + static_cast::View>(lhs), \ + static_cast(rhs), \ + bout::op::OP_KIND{}, \ + rhs.getMesh(), \ + rhs.getLocation(), \ + rhs.getDirections(), \ + regionID, \ + rhs.getMesh()->getRegion("RGN_ALL")}; \ + } + +FIELD3D_BOUTREAL_OP(*, Mul) + /*! * Unary minus. Returns the negative of given field, * iterates over whole domain including guard/boundary cells. diff --git a/include/bout/utils.hxx b/include/bout/utils.hxx index 42aa761886..c8383b12fa 100644 --- a/include/bout/utils.hxx +++ b/include/bout/utils.hxx @@ -426,10 +426,7 @@ inline auto SQ(const T& t) { return t * t; } -template <> -BOUT_HOST_DEVICE inline auto SQ(const BoutReal& t) { - return t * t; -} +BOUT_HOST_DEVICE inline BoutReal SQ(const BoutReal& t) { return t * t; } /*! 
* Round \p x to the nearest integer diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index 9ae40ba41b..4040f8a175 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -315,6 +315,8 @@ FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs) { } // Provide the C++ wrapper for multiplication of Field3D and BoutReal +// here2 +#if 0 Field3D operator*(const Field3D& lhs, const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -331,8 +333,11 @@ Field3D operator*(const Field3D& lhs, const BoutReal rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field3D by multiplication with BoutReal +// here1 +#if 0 Field3D& Field3D::operator*=(const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field @@ -355,6 +360,7 @@ Field3D& Field3D::operator*=(const BoutReal rhs) { } return *this; } +#endif // Provide the C++ wrapper for division of Field3D and BoutReal Field3D operator/(const Field3D& lhs, const BoutReal rhs) { @@ -601,6 +607,7 @@ Field2D operator*(const Field2D& lhs, const Field2D& rhs) { } #endif +#if 0 // Provide the C++ operator to update Field2D by multiplication with Field2D Field2D& Field2D::operator*=(const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -621,6 +628,7 @@ Field2D& Field2D::operator*=(const Field2D& rhs) { } return *this; } +#endif #if 0 // Provide the C++ wrapper for division of Field2D and Field2D @@ -681,6 +689,7 @@ Field2D operator+(const Field2D& lhs, const Field2D& rhs) { } #endif +#if 0 // Provide the C++ operator to update Field2D by addition with Field2D Field2D& Field2D::operator+=(const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -701,6 +710,7 @@ Field2D& 
Field2D::operator+=(const Field2D& rhs) { } return *this; } +#endif // Provide the C++ wrapper for subtraction of Field2D and Field2D #if 0 @@ -721,6 +731,7 @@ Field2D operator-(const Field2D& lhs, const Field2D& rhs) { } #endif +#if 0 // Provide the C++ operator to update Field2D by subtraction with Field2D Field2D& Field2D::operator-=(const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -741,6 +752,7 @@ Field2D& Field2D::operator-=(const Field2D& rhs) { } return *this; } +#endif // Provide the C++ wrapper for multiplication of Field2D and FieldPerp FieldPerp operator*(const Field2D& lhs, const FieldPerp& rhs) { @@ -826,6 +838,7 @@ FieldPerp operator-(const Field2D& lhs, const FieldPerp& rhs) { return result; } +#if 0 // Provide the C++ wrapper for multiplication of Field2D and BoutReal Field2D operator*(const Field2D& lhs, const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -841,7 +854,9 @@ Field2D operator*(const Field2D& lhs, const BoutReal rhs) { checkData(result); return result; } +#endif +#if 0 // Provide the C++ operator to update Field2D by multiplication with BoutReal Field2D& Field2D::operator*=(const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -861,7 +876,9 @@ Field2D& Field2D::operator*=(const BoutReal rhs) { } return *this; } +#endif +#if 0 // Provide the C++ wrapper for division of Field2D and BoutReal Field2D operator/(const Field2D& lhs, const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -878,6 +895,7 @@ Field2D operator/(const Field2D& lhs, const BoutReal rhs) { checkData(result); return result; } +#endif #if 0 // Provide the C++ operator to update Field2D by division with BoutReal @@ -1695,6 +1713,7 @@ Field3D operator/(const BoutReal lhs, const Field3D& rhs) { return result; } +#if 0 // Provide the 
C++ wrapper for addition of BoutReal and Field3D Field3D operator+(const BoutReal lhs, const Field3D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -1712,6 +1731,7 @@ Field3D operator+(const BoutReal lhs, const Field3D& rhs) { checkData(result); return result; } +#endif // Provide the C++ wrapper for subtraction of BoutReal and Field3D Field3D operator-(const BoutReal lhs, const Field3D& rhs) { @@ -1749,6 +1769,7 @@ Field2D operator*(const BoutReal lhs, const Field2D& rhs) { } #endif +#if 0 // Provide the C++ wrapper for division of BoutReal and Field2D Field2D operator/(const BoutReal lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -1764,6 +1785,7 @@ Field2D operator/(const BoutReal lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ wrapper for addition of BoutReal and Field2D Field2D operator+(const BoutReal lhs, const Field2D& rhs) { @@ -1781,6 +1803,7 @@ Field2D operator+(const BoutReal lhs, const Field2D& rhs) { return result; } +#if 0 // Provide the C++ wrapper for subtraction of BoutReal and Field2D Field2D operator-(const BoutReal lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -1796,6 +1819,7 @@ Field2D operator-(const BoutReal lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ wrapper for multiplication of BoutReal and FieldPerp FieldPerp operator*(const BoutReal lhs, const FieldPerp& rhs) { diff --git a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx index 8bded7fee5..b8fd33c019 100644 --- a/src/mesh/coordinates.cxx +++ b/src/mesh/coordinates.cxx @@ -1102,7 +1102,7 @@ int Coordinates::geometry(bool recalculate_staggered, if (localmesh->get(d2x, "d2x" + suffix, 0.0, false, location)) { output_warn.write( "\tWARNING: differencing quantity 'd2x' not found. 
Calculating from dx\n"); - d1_dx = bout::derivatives::index::DDX(1. / dx); // d/di(1/dx) + d1_dx = bout::derivatives::index::DDX(FieldMetric{1. / dx}); // d/di(1/dx) communicate(d1_dx); d1_dx = @@ -1156,7 +1156,7 @@ int Coordinates::geometry(bool recalculate_staggered, if (localmesh->get(d2x, "d2x", 0.0, false)) { output_warn.write( "\tWARNING: differencing quantity 'd2x' not found. Calculating from dx\n"); - d1_dx = bout::derivatives::index::DDX(1. / dx); // d/di(1/dx) + d1_dx = bout::derivatives::index::DDX(FieldMetric{1. / dx}); // d/di(1/dx) communicate(d1_dx); d1_dx = diff --git a/src/physics/snb.cxx b/src/physics/snb.cxx index 80da9e1bf8..475b12ca1a 100644 --- a/src/physics/snb.cxx +++ b/src/physics/snb.cxx @@ -14,7 +14,7 @@ Field3D HeatFluxSNB::divHeatFlux(const Field3D& Te, const Field3D& Ne, Field3D thermal_speed = sqrt(2. * SI::qe * Te / SI::Me); BoutReal Y = SQ(SQ(SI::qe) / (SI::e0 * SI::Me)) / (4 * PI); - Field3D coulomb_log = 6.6 - 0.5 * log(Ne * 1e-20) + 1.5 * log(Te); + Field3D coulomb_log = 6.6 - 0.5 * log(Field3D{Ne * 1e-20}) + 1.5 * log(Te); // Thermal electron-electron mean free path [m] Field3D lambda_ee_T = pow(thermal_speed, 4) / (Y * Ne * coulomb_log); From 338920e284e39ae7f9954a0adcd3e87cb6444992 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Sat, 31 May 2025 21:17:34 -0700 Subject: [PATCH 14/58] More operators --- .../elm-pb-outerloop/elm_pb_outerloop.cxx | 2 +- include/bout/field3d.hxx | 129 +++-- include/bout/fieldops.hxx | 14 +- src/field/generated_fieldops.cxx | 541 +----------------- src/physics/snb.cxx | 2 +- 5 files changed, 76 insertions(+), 612 deletions(-) diff --git a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx index 7d38780814..901e57bf97 100644 --- a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx +++ b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx @@ -1283,7 +1283,7 @@ class ELMpb : public PhysicsModel { //////////////////////////////////////////// // Transitions 
from 0 in core to 1 in vacuum if (nonlinear) { - vac_mask = (1.0 - tanh(((P0 + P) - vacuum_pressure) / vacuum_trans)) / 2.0; + vac_mask = (1.0 - tanh(Field3D{((P0 + P) - vacuum_pressure) / vacuum_trans})) / 2.0; // Update resistivity if (spitzer_resist) { diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index cab156db79..d4dd1397a7 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -749,7 +749,7 @@ BinaryExpr operator/(const L& lhs, const R& rhs) { : lhs.getMesh()->getRegion("RGN_ALL"))}; } -Field3D operator+(const Field3D& lhs, const Field2D& rhs); +//Field3D operator+(const Field3D& lhs, const Field2D& rhs); #if 0 template && is_expr_field2d_v, @@ -771,8 +771,9 @@ BinaryExpr operator+(const L& lhs, const R& rhs) { rhs.getRegion("RGN_ALL")}; } #endif -Field3D operator-(const Field3D& lhs, const Field2D& rhs); +//Field3D operator-(const Field3D& lhs, const Field2D& rhs); //Field3D operator*(const Field3D& lhs, const Field2D& rhs); +#if 0 template std::enable_if_t && is_expr_field2d_v, BinaryExpr> @@ -793,7 +794,9 @@ operator*(const L& lhs, const R& rhs) { regionID, lhs.getMesh()->getRegion("RGN_ALL")}; } +#endif //Field3D operator/(const Field3D& lhs, const Field2D& rhs); +#if 0 template std::enable_if_t && is_expr_field2d_v, BinaryExpr> @@ -814,80 +817,73 @@ operator/(const L& lhs, const R& rhs) { regionID, lhs.getMesh()->getRegion("RGN_ALL")}; } +#endif -Field3D operator+(const Field3D& lhs, BoutReal rhs); -Field3D operator-(const Field3D& lhs, BoutReal rhs); +#define FIELD3D_FIELD3D_FIELD2D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field2d_v, \ + BinaryExpr> operator OP_SYM(const L & lhs, \ + const R & rhs) { \ + auto regionID = lhs.getRegionID(); \ + int mesh_nz = lhs.getMesh()->LocalNz; \ + return BinaryExpr{ \ + static_cast(lhs), \ + static_cast(rhs).setScale(1, mesh_nz), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + regionID, \ + 
lhs.getMesh()->getRegion("RGN_ALL")}; \ + } + +FIELD3D_FIELD3D_FIELD2D_OP(+, Add) +FIELD3D_FIELD3D_FIELD2D_OP(-, Sub) +FIELD3D_FIELD3D_FIELD2D_OP(*, Mul) +FIELD3D_FIELD3D_FIELD2D_OP(/, Div) + +//Field3D operator+(const Field3D& lhs, BoutReal rhs); +//Field3D operator-(const Field3D& lhs, BoutReal rhs); //Field3D operator*(const Field3D& lhs, BoutReal rhs); -//here2 -template -std::enable_if_t && is_expr_constant_v, - BinaryExpr, bout::op::Mul>> -operator*(const L& lhs, R rhs) { - //static_assert(always_false || always_false, "Hello"); - auto regionID = lhs.getRegionID(); +//Field3D operator/(const Field3D& lhs, BoutReal rhs); - return BinaryExpr, bout::op::Mul>{ - static_cast(lhs), - static_cast::View>(rhs), - bout::op::Mul{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - lhs.getMesh()->getRegion("RGN_ALL")}; -} -Field3D operator/(const Field3D& lhs, BoutReal rhs); +#define FIELD3D_FIELD3D_BOUTREAL_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_constant_v, \ + BinaryExpr, bout::op::OP_TYPE>> \ + operator OP_SYM(const L & lhs, R rhs) { \ + auto regionID = lhs.getRegionID(); \ + return BinaryExpr, bout::op::OP_TYPE>{ \ + static_cast(lhs), \ + static_cast::View>(rhs), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + regionID, \ + lhs.getMesh()->getRegion("RGN_ALL")}; \ + } + +FIELD3D_FIELD3D_BOUTREAL_OP(+, Add) +FIELD3D_FIELD3D_BOUTREAL_OP(-, Sub) +FIELD3D_FIELD3D_BOUTREAL_OP(*, Mul) +FIELD3D_FIELD3D_BOUTREAL_OP(/, Div) //Field3D operator+(BoutReal lhs, const Field3D& rhs); -template -std::enable_if_t && is_expr_field3d_v, - BinaryExpr, R, bout::op::Add>> -operator+(const L& lhs, const R& rhs) { - auto regionID = rhs.getRegionID(); - - return BinaryExpr, R, bout::op::Add>{ - static_cast::View>(lhs), - static_cast(rhs), - bout::op::Add{}, - rhs.getMesh(), - rhs.getLocation(), - rhs.getDirections(), - regionID, - rhs.getMesh()->getRegion("RGN_ALL")}; -} -Field3D 
operator-(BoutReal lhs, const Field3D& rhs); -#if 0 +//Field3D operator-(BoutReal lhs, const Field3D& rhs); //Field3D operator*(BoutReal lhs, const Field3D& rhs); -template -std::enable_if_t && is_expr_field3d_v, - BinaryExpr, R, bout::op::Mul>> -operator*(const L& lhs, const R& rhs) { - auto regionID = rhs.getRegionID(); - - return BinaryExpr, R, bout::op::Mul>{ - static_cast::View>(lhs), - static_cast(rhs), - bout::op::Mul{}, - rhs.getMesh(), - rhs.getLocation(), - rhs.getDirections(), - regionID, - rhs.getMesh()->getRegion("RGN_ALL")}; -} -#endif - -Field3D operator/(BoutReal lhs, const Field3D& rhs); +//Field3D operator/(BoutReal lhs, const Field3D& rhs); -#define FIELD3D_BOUTREAL_OP(OP_SYM, OP_KIND) \ +#define FIELD3D_BOUTREAL_FIELD3D_OP(OP_SYM, OP_TYPE) \ template \ std::enable_if_t && is_expr_field3d_v, \ - BinaryExpr, R, bout::op::OP_KIND>> \ - operator OP_SYM(const L & lhs, const R & rhs) { \ + BinaryExpr, R, bout::op::OP_TYPE>> \ + operator OP_SYM(const L & lhs, const R & rhs) { \ auto regionID = rhs.getRegionID(); \ - return BinaryExpr, R, bout::op::OP_KIND>{ \ + return BinaryExpr, R, bout::op::OP_TYPE>{ \ static_cast::View>(lhs), \ static_cast(rhs), \ - bout::op::OP_KIND{}, \ + bout::op::OP_TYPE{}, \ rhs.getMesh(), \ rhs.getLocation(), \ rhs.getDirections(), \ @@ -895,7 +891,10 @@ Field3D operator/(BoutReal lhs, const Field3D& rhs); rhs.getMesh()->getRegion("RGN_ALL")}; \ } -FIELD3D_BOUTREAL_OP(*, Mul) +FIELD3D_BOUTREAL_FIELD3D_OP(+, Add) +FIELD3D_BOUTREAL_FIELD3D_OP(-, Sub) +FIELD3D_BOUTREAL_FIELD3D_OP(*, Mul) +FIELD3D_BOUTREAL_FIELD3D_OP(/, Div) /*! * Unary minus. 
Returns the negative of given field, diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 6ec7947be4..0de73d1205 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -38,12 +38,7 @@ template struct is_expr_constant> : std::integral_constant>> {}; -// After the specialization… -static_assert(is_expr_constant_v> == true, - "Constant should be recognized as an expr_constant!"); -static_assert(is_expr_constant_v> == true, - "Constant should be recognized as an expr_constant!"); - +constexpr int THREADS = 256; namespace bout { namespace op { struct Assign { @@ -99,8 +94,7 @@ struct Add { }; template -__global__ __launch_bounds__(256) static void evaluatorExpr(BoutReal* out, - const Expr expr) { +__global__ void __launch_bounds__(THREADS) evaluatorExpr(BoutReal* out, const Expr expr) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= expr.size()) { return; @@ -197,13 +191,9 @@ struct BinaryExpr { operator View() const { return View{lhs, rhs, &indices[0], indices.size(), f}; } void evaluate(BoutReal* data) const { - constexpr int THREADS = 256; int blocks = (size() + THREADS - 1) / THREADS; evaluatorExpr<<>>(&data[0], static_cast(*this)); cudaDeviceSynchronize(); - //for(int i=0; i #include -// Provide the C++ wrapper for multiplication of Field3D and Field2D -#if 0 -Field3D operator*(const Field3D& lhs, const Field2D& rhs) { - std::cout << "RUNNING operator "<< __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[base_ind + jz] * rhs[index]; - } - } - - checkData(result); - return result; -} -#endif - // Provide the C++ operator to update Field3D by 
multiplication with Field2D Field3D& Field3D::operator*=(const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -63,65 +37,6 @@ Field3D& Field3D::operator*=(const Field2D& rhs) { } #if 0 -// Provide the C++ wrapper for division of Field3D and Field2D -Field3D operator/(const Field3D& lhs, const Field2D& rhs) { - std::cout << "RUNNING operator "<< __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - const auto tmp = 1.0 / rhs[index]; - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[base_ind + jz] * tmp; - } - } - - checkData(result); - return result; -} -#endif - -#if 0 -// Provide the C++ operator to update Field3D by division with Field2D -Field3D& Field3D::operator/=(const Field2D& rhs) { - std::cout << "RUNNING operator "<< __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = fieldmesh->ind2Dto3D(index); - const auto tmp = 1.0 / rhs[index]; - for (int jz = 0; jz < fieldmesh->LocalNz; ++jz) { - (*this)[base_ind + jz] *= tmp; - } - } - - checkData(*this); - - } else { - (*this) = (*this) / rhs; - } - return *this; -} -#endif - // Provide the C++ wrapper for addition of Field3D and Field2D Field3D operator+(const Field3D& lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -145,6 +60,7 @@ Field3D operator+(const Field3D& lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field3D by addition with Field2D Field3D& Field3D::operator+=(const Field2D& rhs) { @@ -176,6 +92,7 @@ Field3D& Field3D::operator+=(const Field2D& rhs) { return *this; } +#if 0 // Provide the C++ wrapper for subtraction of Field3D and Field2D Field3D operator-(const Field3D& lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -199,6 +116,7 @@ Field3D operator-(const Field3D& lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field3D by subtraction with Field2D Field3D& Field3D::operator-=(const Field2D& rhs) { @@ -314,54 +232,7 @@ FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs) { return result; } -// Provide the C++ wrapper for multiplication of Field3D and BoutReal -// here2 -#if 0 -Field3D operator*(const Field3D& lhs, const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * rhs; - } - - checkData(result); - return 
result; -} -#endif - -// Provide the C++ operator to update Field3D by multiplication with BoutReal -// here1 #if 0 -Field3D& Field3D::operator*=(const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= rhs; } - - checkData(*this); - - } else { - (*this) = (*this) * rhs; - } - return *this; -} -#endif - // Provide the C++ wrapper for division of Field3D and BoutReal Field3D operator/(const Field3D& lhs, const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -380,6 +251,7 @@ Field3D operator/(const Field3D& lhs, const BoutReal rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field3D by division with BoutReal Field3D& Field3D::operator/=(const BoutReal rhs) { @@ -406,6 +278,7 @@ Field3D& Field3D::operator/=(const BoutReal rhs) { return *this; } +#if 0 // Provide the C++ wrapper for addition of Field3D and BoutReal Field3D operator+(const Field3D& lhs, const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -423,6 +296,7 @@ Field3D operator+(const Field3D& lhs, const BoutReal rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field3D by addition with BoutReal Field3D& Field3D::operator+=(const BoutReal rhs) { @@ -448,6 +322,7 @@ Field3D& Field3D::operator+=(const BoutReal rhs) { return *this; } +#if 0 // Provide the C++ wrapper for subtraction of Field3D and BoutReal Field3D operator-(const Field3D& lhs, const BoutReal rhs) { std::cout << "RUNNING 
operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -465,6 +340,7 @@ Field3D operator-(const Field3D& lhs, const BoutReal rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field3D by subtraction with BoutReal Field3D& Field3D::operator-=(const BoutReal rhs) { @@ -490,32 +366,6 @@ Field3D& Field3D::operator-=(const BoutReal rhs) { return *this; } -// Provide the C++ wrapper for multiplication of Field2D and Field3D -#if 0 -Field3D operator*(const Field2D& lhs, const Field3D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, lhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[index] * rhs[base_ind + jz]; - } - } - - checkData(result); - return result; -} -#endif - // Provide the C++ wrapper for division of Field2D and Field3D Field3D operator/(const Field2D& lhs, const Field3D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -588,67 +438,6 @@ Field3D operator-(const Field2D& lhs, const Field3D& rhs) { return result; } -// Provide the C++ wrapper for multiplication of Field2D and Field2D -#if 0 -Field2D operator*(const Field2D& lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * rhs[index]; - } - - checkData(result); - return result; -} -#endif - -#if 0 -// Provide the C++ operator to update Field2D by multiplication with Field2D 
-Field2D& Field2D::operator*=(const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) * rhs; - } - return *this; -} -#endif - -#if 0 -// Provide the C++ wrapper for division of Field2D and Field2D -Field2D operator/(const Field2D& lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] / rhs[index]; - } - - checkData(result); - return result; -} -#endif - // Provide the C++ operator to update Field2D by division with Field2D Field2D& Field2D::operator/=(const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -670,90 +459,6 @@ Field2D& Field2D::operator/=(const Field2D& rhs) { return *this; } -// Provide the C++ wrapper for addition of Field2D and Field2D -#if 0 -Field2D operator+(const Field2D& lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] + rhs[index]; - } - - checkData(result); - return result; -} -#endif - -#if 0 -// Provide the C++ operator to update Field2D by addition with Field2D -Field2D& Field2D::operator+=(const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ 
<< " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] += rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) + rhs; - } - return *this; -} -#endif - -// Provide the C++ wrapper for subtraction of Field2D and Field2D -#if 0 -Field2D operator-(const Field2D& lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] - rhs[index]; - } - - checkData(result); - return result; -} -#endif - -#if 0 -// Provide the C++ operator to update Field2D by subtraction with Field2D -Field2D& Field2D::operator-=(const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] -= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) - rhs; - } - return *this; -} -#endif - // Provide the C++ wrapper for multiplication of Field2D and FieldPerp FieldPerp operator*(const Field2D& lhs, const FieldPerp& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -838,88 +543,6 @@ FieldPerp operator-(const Field2D& lhs, const FieldPerp& rhs) { return result; } -#if 0 -// Provide the C++ wrapper for multiplication of Field2D and BoutReal -Field2D operator*(const Field2D& lhs, const BoutReal rhs) { - 
std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * rhs; - } - - checkData(result); - return result; -} -#endif - -#if 0 -// Provide the C++ operator to update Field2D by multiplication with BoutReal -Field2D& Field2D::operator*=(const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= rhs; } - - checkData(*this); - - } else { - (*this) = (*this) * rhs; - } - return *this; -} -#endif - -#if 0 -// Provide the C++ wrapper for division of Field2D and BoutReal -Field2D operator/(const Field2D& lhs, const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - const auto tmp = 1.0 / rhs; - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * tmp; - } - - checkData(result); - return result; -} -#endif - -#if 0 -// Provide the C++ operator to update Field2D by division with BoutReal -Field2D& Field2D::operator/=(const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - checkData(*this); - checkData(rhs); - - const auto tmp = 1.0 / rhs; - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= tmp; } - - checkData(*this); - - } else { - (*this) = (*this) / rhs; - } - return *this; -} -#endif - // Provide the C++ wrapper for addition of Field2D 
and BoutReal Field2D operator+(const Field2D& lhs, const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -956,24 +579,6 @@ Field2D& Field2D::operator+=(const BoutReal rhs) { return *this; } -// Provide the C++ wrapper for subtraction of Field2D and BoutReal -#if 0 -Field2D operator-(const Field2D& lhs, const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] - rhs; - } - - checkData(result); - return result; -} -#endif - // Provide the C++ operator to update Field2D by subtraction with BoutReal Field2D& Field2D::operator-=(const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -1675,118 +1280,6 @@ FieldPerp& FieldPerp::operator-=(const BoutReal rhs) { return *this; } -#if 0 -// Provide the C++ wrapper for multiplication of BoutReal and Field3D -Field3D operator*(const BoutReal lhs, const Field3D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs * rhs[index]; - } - - checkData(result); - return result; -} -#endif - -// Provide the C++ wrapper for division of BoutReal and Field3D -Field3D operator/(const BoutReal lhs, const Field3D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs / rhs[index]; - } - - checkData(result); - return result; 
-} - -#if 0 -// Provide the C++ wrapper for addition of BoutReal and Field3D -Field3D operator+(const BoutReal lhs, const Field3D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs + rhs[index]; - } - - checkData(result); - return result; -} -#endif - -// Provide the C++ wrapper for subtraction of BoutReal and Field3D -Field3D operator-(const BoutReal lhs, const Field3D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs - rhs[index]; - } - - checkData(result); - return result; -} - -#if 0 -// Provide the C++ wrapper for multiplication of BoutReal and Field2D -Field2D operator*(const BoutReal lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field2D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs * rhs[index]; - } - - checkData(result); - return result; -} -#endif - -#if 0 -// Provide the C++ wrapper for division of BoutReal and Field2D -Field2D operator/(const BoutReal lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field2D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs / rhs[index]; - } - - checkData(result); - return result; -} -#endif - // Provide the C++ wrapper for addition of BoutReal and Field2D Field2D operator+(const 
BoutReal lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -1803,24 +1296,6 @@ Field2D operator+(const BoutReal lhs, const Field2D& rhs) { return result; } -#if 0 -// Provide the C++ wrapper for subtraction of BoutReal and Field2D -Field2D operator-(const BoutReal lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field2D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs - rhs[index]; - } - - checkData(result); - return result; -} -#endif - // Provide the C++ wrapper for multiplication of BoutReal and FieldPerp FieldPerp operator*(const BoutReal lhs, const FieldPerp& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; diff --git a/src/physics/snb.cxx b/src/physics/snb.cxx index 475b12ca1a..f21bfb7ee0 100644 --- a/src/physics/snb.cxx +++ b/src/physics/snb.cxx @@ -11,7 +11,7 @@ namespace bout { Field3D HeatFluxSNB::divHeatFlux(const Field3D& Te, const Field3D& Ne, Field3D* Div_Q_SH_out) { - Field3D thermal_speed = sqrt(2. * SI::qe * Te / SI::Me); + Field3D thermal_speed = sqrt(Field3D{2. 
* SI::qe * Te / SI::Me}); BoutReal Y = SQ(SQ(SI::qe) / (SI::e0 * SI::Me)) / (4 * PI); Field3D coulomb_log = 6.6 - 0.5 * log(Field3D{Ne * 1e-20}) + 1.5 * log(Te); From 6e8818114e398258ca548d3b85e061d92c78264e Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Sat, 31 May 2025 23:25:47 -0700 Subject: [PATCH 15/58] More operators and cleanup --- include/bout/field2d.hxx | 25 ++++- include/bout/field3d.hxx | 157 +++++-------------------------- src/field/generated_fieldops.cxx | 2 + 3 files changed, 47 insertions(+), 137 deletions(-) diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index 97946ee713..b7911b7e76 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -538,7 +538,8 @@ operator/(const L& lhs, R rhs) { lhs.getMesh()->getRegion2D("RGN_ALL")}; } -Field2D operator+(BoutReal lhs, const Field2D& rhs); +#if 0 +//Field2D operator+(BoutReal lhs, const Field2D& rhs); //Field2D operator-(BoutReal lhs, const Field2D& rhs); template std::enable_if_t && is_expr_field2d_v, @@ -588,6 +589,28 @@ operator/(L lhs, const R& rhs) { std::nullopt, rhs.getMesh()->getRegion2D("RGN_ALL")}; } +#endif + +#define FIELD2D_BOUTREAL_FIELD2D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field2d_v, \ + BinaryExpr, R, bout::op::OP_TYPE>> \ + operator OP_SYM(L lhs, const R & rhs) { \ + return BinaryExpr, R, bout::op::OP_TYPE>{ \ + static_cast::View>(lhs), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + rhs.getMesh(), \ + rhs.getLocation(), \ + rhs.getDirections(), \ + std::nullopt, \ + rhs.getMesh()->getRegion2D("RGN_ALL")}; \ + } + +FIELD2D_BOUTREAL_FIELD2D_OP(+, Add) +FIELD2D_BOUTREAL_FIELD2D_OP(-, Sub) +FIELD2D_BOUTREAL_FIELD2D_OP(*, Mul) +FIELD2D_BOUTREAL_FIELD2D_OP(/, Div) /*! * Unary minus. 
Returns the negative of given field, diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index d4dd1397a7..8f433909f9 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -676,148 +676,33 @@ FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator*(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); -template && is_expr_field3d_v>> -BinaryExpr operator+(const L& lhs, const R& rhs) { - auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - - //std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast(lhs), - static_cast(rhs), - bout::op::Add{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - (regionID.has_value() - ? lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; -} - -template && is_expr_field3d_v>> -BinaryExpr operator-(const L& lhs, const R& rhs) { - auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - - //std::cout << "RUNNING operator- using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast(lhs), - static_cast(rhs), - bout::op::Sub{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - (regionID.has_value() - ? lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; -} - -template -std::enable_if_t && is_expr_field3d_v, - BinaryExpr> -operator*(const L& lhs, const R& rhs) { - auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - - //std::cout << "RUNNING operator* using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast(lhs), - static_cast(rhs), - bout::op::Mul{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - (regionID.has_value() - ? 
lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; -} +#define FIELD3D_FIELD3D_FIELD3D_OP(OP_SYM, OP_TYPE) \ + template && is_expr_field3d_v>> \ + BinaryExpr operator OP_SYM(const L & lhs, const R & rhs) { \ + auto regionID = \ + lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); \ + return BinaryExpr{ \ + static_cast(lhs), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + regionID, \ + (regionID.has_value() ? lhs.getMesh()->getRegion(regionID.value()) \ + : lhs.getMesh()->getRegion("RGN_ALL"))}; \ + } -template && is_expr_field3d_v>> -BinaryExpr operator/(const L& lhs, const R& rhs) { - auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - - //std::cout << "RUNNING operator/ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast(lhs), - static_cast(rhs), - bout::op::Div{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - (regionID.has_value() - ? 
lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; -} +FIELD3D_FIELD3D_FIELD3D_OP(+, Add) +FIELD3D_FIELD3D_FIELD3D_OP(-, Sub) +FIELD3D_FIELD3D_FIELD3D_OP(*, Mul) +FIELD3D_FIELD3D_FIELD3D_OP(/, Div) //Field3D operator+(const Field3D& lhs, const Field2D& rhs); -#if 0 -template && is_expr_field2d_v, - BinaryExpr>> -BinaryExpr operator+(const L& lhs, const R& rhs) { - //static_assert(always_false || always_false, "Hello"); - auto regionID = lhs.getRegionID(); - - std::cout << "RUNNING Field3D + Field2D using BinaryExpr with CUDA" << "\n"; - int mesh_nz = lhs.getMesh()->LocalNz; - - return BinaryExpr{static_cast(lhs), - static_cast(rhs).setScale(1, mesh_nz), - bout::op::Add{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - rhs.getRegion("RGN_ALL")}; -} -#endif //Field3D operator-(const Field3D& lhs, const Field2D& rhs); //Field3D operator*(const Field3D& lhs, const Field2D& rhs); -#if 0 -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr> -operator*(const L& lhs, const R& rhs) { - //static_assert(always_false || always_false, "Hello"); - auto regionID = lhs.getRegionID(); - - //std::cout << "RUNNING Field3D * Field2D using BinaryExpr with CUDA" << "\n"; - int mesh_nz = lhs.getMesh()->LocalNz; - - return BinaryExpr{ - static_cast(lhs), - static_cast(rhs).setScale(1, mesh_nz), - bout::op::Mul{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - lhs.getMesh()->getRegion("RGN_ALL")}; -} -#endif //Field3D operator/(const Field3D& lhs, const Field2D& rhs); -#if 0 -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr> -operator/(const L& lhs, const R& rhs) { - //static_assert(always_false || always_false, "Hello"); - auto regionID = lhs.getRegionID(); - - //std::cout << "RUNNING Field3D * Field2D using BinaryExpr with CUDA" << "\n"; - int mesh_nz = lhs.getMesh()->LocalNz; - - return BinaryExpr{ - static_cast(lhs), - static_cast(rhs).setScale(1, mesh_nz), - 
bout::op::Div{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - lhs.getMesh()->getRegion("RGN_ALL")}; -} -#endif #define FIELD3D_FIELD3D_FIELD2D_OP(OP_SYM, OP_TYPE) \ template \ diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index c69597c8db..dca1605b91 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -1280,6 +1280,7 @@ FieldPerp& FieldPerp::operator-=(const BoutReal rhs) { return *this; } +#if 0 // Provide the C++ wrapper for addition of BoutReal and Field2D Field2D operator+(const BoutReal lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -1295,6 +1296,7 @@ Field2D operator+(const BoutReal lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ wrapper for multiplication of BoutReal and FieldPerp FieldPerp operator*(const BoutReal lhs, const FieldPerp& rhs) { From 25d1272acdae941d8141fb671a97a790438cb0b5 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Sun, 1 Jun 2025 00:42:44 -0700 Subject: [PATCH 16/58] More operators and cleanup --- include/bout/field2d.hxx | 334 ++++++-------------------- include/bout/field3d.hxx | 158 ++----------- src/field/generated_fieldops.cxx | 386 +------------------------------ 3 files changed, 98 insertions(+), 780 deletions(-) diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index b7911b7e76..f430b98c6c 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -269,83 +269,23 @@ public: return operator()(jx, jy); } - /// In-place addition. Copy-on-write used if data is shared - //Field2D& operator+=(const Field2D& rhs); - template >> - Field2D& operator+=(const R& rhs) { - //printf("RUNNING operator+= with CUDA\n"); - if (data.unique()) { - auto BE = (*this) + rhs; - BE.evaluate(&data[0]); - } else { - (*this) = (*this) * rhs; - } - - return *this; - } - /// In-place addition. 
Copy-on-write used if data is shared - Field2D& operator+=(BoutReal rhs); - /// In-place subtraction. Copy-on-write used if data is shared - //Field2D& operator-=(const Field2D& rhs); - //here1 - template >> - Field2D& operator-=(const R& rhs) { - //printf("RUNNING operator+= with CUDA\n"); - if (data.unique()) { - auto BE = (*this) - rhs; - BE.evaluate(&data[0]); - } else { - (*this) = (*this) * rhs; - } - - return *this; +#define FIELD2D_OP_EQUALS(OP_SYM) \ + template \ + std::enable_if_t || is_expr_constant_v, Field2D&> \ + operator OP_SYM##=(R rhs) { \ + if (data.unique()) { \ + auto BE = (*this)OP_SYM rhs; \ + BE.evaluate(&data[0]); \ + } else { \ + (*this) = (*this)OP_SYM rhs; \ + } \ + return *this; \ } - /// In-place subtraction. Copy-on-write used if data is shared - Field2D& operator-=(BoutReal rhs); - /// In-place multiplication. Copy-on-write used if data is shared - //Field2D& operator*=(const Field2D& rhs); - template >> - Field2D& operator*=(const R& rhs) { - //printf("RUNNING operator+= with CUDA\n"); - if (data.unique()) { - auto BE = (*this) * rhs; - BE.evaluate(&data[0]); - } else { - (*this) = (*this) * rhs; - } - return *this; - } - /// In-place multiplication. Copy-on-write used if data is shared - //Field2D& operator*=(BoutReal rhs); - template >> - Field2D& operator*=(R rhs) { - //printf("RUNNING operator+= with CUDA\n"); - if (data.unique()) { - auto BE = (*this) * rhs; - BE.evaluate(&data[0]); - } else { - (*this) = (*this) * rhs; - } - - return *this; - } - /// In-place division. Copy-on-write used if data is shared - Field2D& operator/=(const Field2D& rhs); - /// In-place division. 
Copy-on-write used if data is shared - //Field2D& operator/=(BoutReal rhs); - template >> - Field2D& operator/=(R rhs) { - //printf("RUNNING operator+= with CUDA\n"); - if (data.unique()) { - auto BE = (*this) / rhs; - BE.evaluate(&data[0]); - } else { - (*this) = (*this) / rhs; - } - - return *this; - } + FIELD2D_OP_EQUALS(+) + FIELD2D_OP_EQUALS(-) + FIELD2D_OP_EQUALS(*) + FIELD2D_OP_EQUALS(/) // FieldData virtual functions @@ -405,191 +345,69 @@ private: // Non-member overloaded operators -//Field2D operator+(const Field2D& lhs, const Field2D& rhs); -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr> -operator+(const L& lhs, const R& rhs) { - return BinaryExpr{static_cast(lhs), - static_cast(rhs), - bout::op::Add{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - std::nullopt, - lhs.getMesh()->getRegion2D("RGN_ALL")}; -} -//Field2D operator-(const Field2D& lhs, const Field2D& rhs); -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr> -operator-(const L& lhs, const R& rhs) { - return BinaryExpr{static_cast(lhs), - static_cast(rhs), - bout::op::Sub{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - std::nullopt, - lhs.getMesh()->getRegion2D("RGN_ALL")}; -} -//Field2D operator*(const Field2D& lhs, const Field2D& rhs); -#if 1 -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr> -operator*(const L& lhs, const R& rhs) { - return BinaryExpr{static_cast(lhs), - static_cast(rhs), - bout::op::Mul{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - std::nullopt, - lhs.getMesh()->getRegion2D("RGN_ALL")}; -} -#endif -//Field2D operator/(const Field2D& lhs, const Field2D& rhs); -#if 1 -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr> -operator/(const L& lhs, const R& rhs) { - return BinaryExpr{static_cast(lhs), - static_cast(rhs), - bout::op::Div{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - std::nullopt, - lhs.getMesh()->getRegion2D("RGN_ALL")}; -} -#endif +#define 
FIELD2D_FIELD2D_FIELD2D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field2d_v, \ + BinaryExpr> operator OP_SYM(const L & lhs, \ + const R & rhs) { \ + return BinaryExpr{static_cast(lhs), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + std::nullopt, \ + lhs.getMesh()->getRegion2D("RGN_ALL")}; \ + } -Field3D operator+(const Field2D& lhs, const Field3D& rhs); -Field3D operator-(const Field2D& lhs, const Field3D& rhs); -//Field3D operator*(const Field2D& lhs, const Field3D& rhs); -template -std::enable_if_t && is_expr_field3d_v, - BinaryExpr> -operator*(const L& lhs, const R& rhs) { - //static_assert(always_false || always_false, "Hello"); - auto regionID = rhs.getRegionID(); - - //std::cout << "RUNNING Field3D * Field2D using BinaryExpr with CUDA" << "\n"; - int mesh_nz = rhs.getMesh()->LocalNz; - - return BinaryExpr{ - static_cast(lhs).setScale(1, mesh_nz), - static_cast(rhs), - bout::op::Mul{}, - rhs.getMesh(), - rhs.getLocation(), - rhs.getDirections(), - regionID, - rhs.getMesh()->getRegion("RGN_ALL")}; -} -Field3D operator/(const Field2D& lhs, const Field3D& rhs); - -Field2D operator+(const Field2D& lhs, BoutReal rhs); -//Field2D operator-(const Field2D& lhs, BoutReal rhs); -template -std::enable_if_t && is_expr_constant_v, - BinaryExpr, bout::op::Sub>> -operator-(const L& lhs, R rhs) { - return BinaryExpr, bout::op::Sub>{ - static_cast(lhs), - static_cast::View>(rhs), - bout::op::Sub{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - std::nullopt, - lhs.getMesh()->getRegion2D("RGN_ALL")}; -} -//Field2D operator*(const Field2D& lhs, BoutReal rhs); -template -std::enable_if_t && is_expr_constant_v, - BinaryExpr, bout::op::Mul>> -operator*(const L& lhs, R rhs) { - return BinaryExpr, bout::op::Mul>{ - static_cast(lhs), - static_cast::View>(rhs), - bout::op::Mul{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - std::nullopt, - 
lhs.getMesh()->getRegion2D("RGN_ALL")}; -} -//Field2D operator/(const Field2D& lhs, BoutReal rhs); -template -std::enable_if_t && is_expr_constant_v, - BinaryExpr, bout::op::Div>> -operator/(const L& lhs, R rhs) { - return BinaryExpr, bout::op::Div>{ - static_cast(lhs), - static_cast::View>(rhs), - bout::op::Div{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - std::nullopt, - lhs.getMesh()->getRegion2D("RGN_ALL")}; -} +FIELD2D_FIELD2D_FIELD2D_OP(+, Add) +FIELD2D_FIELD2D_FIELD2D_OP(-, Sub) +FIELD2D_FIELD2D_FIELD2D_OP(*, Mul) +FIELD2D_FIELD2D_FIELD2D_OP(/, Div) + +#define FIELD3D_FIELD2D_FIELD3D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field3d_v, \ + BinaryExpr> operator OP_SYM(const L & lhs, \ + const R & rhs) { \ + auto regionID = rhs.getRegionID(); \ + int mesh_nz = rhs.getMesh()->LocalNz; \ + return BinaryExpr{ \ + static_cast(lhs).setScale(1, mesh_nz), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + rhs.getMesh(), \ + rhs.getLocation(), \ + rhs.getDirections(), \ + regionID, \ + rhs.getMesh()->getRegion("RGN_ALL")}; \ + } -#if 0 -//Field2D operator+(BoutReal lhs, const Field2D& rhs); -//Field2D operator-(BoutReal lhs, const Field2D& rhs); -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr, R, bout::op::Sub>> -operator-(L lhs, const R& rhs) { - //static_assert(always_false || always_false, "Hello"); - - return BinaryExpr, R, bout::op::Sub>{ - static_cast::View>(lhs), - static_cast(rhs), - bout::op::Sub{}, - rhs.getMesh(), - rhs.getLocation(), - rhs.getDirections(), - std::nullopt, - rhs.getMesh()->getRegion2D("RGN_ALL")}; -} -//Field2D operator*(BoutReal lhs, const Field2D& rhs); -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr, R, bout::op::Mul>> -operator*(L lhs, const R& rhs) { - //static_assert(always_false || always_false, "Hello"); - - return BinaryExpr, R, bout::op::Mul>{ - static_cast::View>(lhs), - static_cast(rhs), - bout::op::Mul{}, - rhs.getMesh(), - rhs.getLocation(), - 
rhs.getDirections(), - std::nullopt, - rhs.getMesh()->getRegion2D("RGN_ALL")}; -} -//Field2D operator/(BoutReal lhs, const Field2D& rhs); -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr, R, bout::op::Div>> -operator/(L lhs, const R& rhs) { - return BinaryExpr, R, bout::op::Div>{ - static_cast::View>(lhs), - static_cast(rhs), - bout::op::Div{}, - rhs.getMesh(), - rhs.getLocation(), - rhs.getDirections(), - std::nullopt, - rhs.getMesh()->getRegion2D("RGN_ALL")}; -} -#endif +FIELD3D_FIELD2D_FIELD3D_OP(+, Add) +FIELD3D_FIELD2D_FIELD3D_OP(-, Sub) +FIELD3D_FIELD2D_FIELD3D_OP(*, Mul) +FIELD3D_FIELD2D_FIELD3D_OP(/, Div) + +#define FIELD2D_FIELD2D_BOUTREAL_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_constant_v, \ + BinaryExpr, bout::op::OP_TYPE>> \ + operator OP_SYM(const L & lhs, R rhs) { \ + return BinaryExpr, bout::op::OP_TYPE>{ \ + static_cast(lhs), \ + static_cast::View>(rhs), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + std::nullopt, \ + lhs.getMesh()->getRegion2D("RGN_ALL")}; \ + } + +FIELD2D_FIELD2D_BOUTREAL_OP(+, Add) +FIELD2D_FIELD2D_BOUTREAL_OP(-, Sub) +FIELD2D_FIELD2D_BOUTREAL_OP(*, Mul) +FIELD2D_FIELD2D_BOUTREAL_OP(/, Div) #define FIELD2D_BOUTREAL_FIELD2D_OP(OP_SYM, OP_TYPE) \ template \ diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 8f433909f9..fac2adc337 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -478,135 +478,26 @@ public: ///@} - /// Addition operators - ///@{ - //Field3D& operator+=(const Field3D& rhs); - template >> - Field3D& operator+=(const R& rhs) { - //printf("RUNNING operator+= with CUDA\n"); - if (data.unique()) { - //std::cout << "RUNNING Field3D operator+= w/ CUDA" << __FILE__ << " " - // << std::to_string(__LINE__) << "\n"; - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - auto BE = (*this) + rhs; - regionID = BE.getRegionID(); - BE.evaluate(&data[0]); - } else { - (*this) = (*this) + rhs; - } - - return *this; - } - Field3D& operator+=(const Field2D& rhs); - Field3D& operator+=(BoutReal rhs); - ///@} - - /// Subtraction operators - ///@{ - //Field3D& operator-=(const Field3D& rhs); - template >> - Field3D& operator-=(const R& rhs) { - if (data.unique()) { - //printf("RUNNING operator-= with CUDA with BE\n"); - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - auto BE = (*this) - rhs; - BE.evaluate(&data[0]); - } else { - //printf("RUNNING operator-= with CUDA with operation\n"); - (*this) = (*this) - rhs; - } - - return *this; - } - Field3D& operator-=(const Field2D& rhs); - Field3D& operator-=(BoutReal rhs); - ///@} - - /// Multiplication operators - ///@{ - //Field3D& operator*=(const Field3D& rhs); - template >> - Field3D& operator*=(const R& rhs) { - //printf("RUNNING operator*= with CUDA\n"); - if (data.unique()) { - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - auto BE = (*this) * rhs; - regionID = BE.getRegionID(); - BE.evaluate(&data[0]); - } else { - (*this) = (*this) * rhs; - } - - return *this; +#define FIELD3D_OP_EQUALS(OP_SYM) \ + template \ + std::enable_if_t || is_expr_field2d_v \ + || is_expr_constant_v, \ + Field3D&> operator OP_SYM##=(const R & rhs) { \ + if (data.unique()) { \ + clearParallelSlices(); \ + auto Expr = (*this)OP_SYM rhs; \ + Expr.evaluate(&data[0]); \ + } else { \ + (*this) = (*this)OP_SYM rhs; \ + } \ + return *this; \ } - Field3D& operator*=(const Field2D& rhs); - //Field3D& operator*=(BoutReal rhs); - // here1 - template >> - Field3D& operator*=(R rhs) { - //printf("RUNNING operator*= with CUDA\n"); - if (data.unique()) { - // Delete existing parallel slices. 
We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - auto BE = (*this) * rhs; - regionID = BE.getRegionID(); - BE.evaluate(&data[0]); - } else { - (*this) = (*this) * rhs; - } - return *this; - } - ///@} + FIELD3D_OP_EQUALS(+) + FIELD3D_OP_EQUALS(-) + FIELD3D_OP_EQUALS(*) + FIELD3D_OP_EQUALS(/) - /// Division operators - ///@{ - //Field3D& operator/=(const Field3D& rhs); - template - std::enable_if_t,Field3D&> operator/=(const R& rhs) { - //printf("RUNNING operator/= with CUDA\n"); - if (data.unique()) { - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - auto BE = (*this) / rhs; - regionID = BE.getRegionID(); - BE.evaluate(&data[0]); - } else { - (*this) = (*this) / rhs; - } - - return *this; - } - //Field3D& operator/=(const Field2D& rhs); - template -std::enable_if_t, Field3D&> operator/=(const R& rhs) { - //printf("RUNNING operator/= with CUDA\n"); - if (data.unique()) { - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - auto BE = (*this) / rhs; - BE.evaluate(&data[0]); - } else { - (*this) = (*this) / rhs; - } - - return *this; - } - Field3D& operator/=(BoutReal rhs); ///@} // FieldData virtual functions @@ -699,11 +590,6 @@ FIELD3D_FIELD3D_FIELD3D_OP(-, Sub) FIELD3D_FIELD3D_FIELD3D_OP(*, Mul) FIELD3D_FIELD3D_FIELD3D_OP(/, Div) -//Field3D operator+(const Field3D& lhs, const Field2D& rhs); -//Field3D operator-(const Field3D& lhs, const Field2D& rhs); -//Field3D operator*(const Field3D& lhs, const Field2D& rhs); -//Field3D operator/(const Field3D& lhs, const Field2D& rhs); - #define FIELD3D_FIELD3D_FIELD2D_OP(OP_SYM, OP_TYPE) \ template \ std::enable_if_t && is_expr_field2d_v, \ @@ -727,11 +613,6 @@ FIELD3D_FIELD3D_FIELD2D_OP(-, Sub) FIELD3D_FIELD3D_FIELD2D_OP(*, Mul) FIELD3D_FIELD3D_FIELD2D_OP(/, Div) -//Field3D operator+(const Field3D& lhs, BoutReal rhs); -//Field3D operator-(const Field3D& lhs, BoutReal rhs); -//Field3D operator*(const Field3D& lhs, BoutReal rhs); -//Field3D operator/(const Field3D& lhs, BoutReal rhs); - #define FIELD3D_FIELD3D_BOUTREAL_OP(OP_SYM, OP_TYPE) \ template \ std::enable_if_t && is_expr_constant_v, \ @@ -754,11 +635,6 @@ FIELD3D_FIELD3D_BOUTREAL_OP(-, Sub) FIELD3D_FIELD3D_BOUTREAL_OP(*, Mul) FIELD3D_FIELD3D_BOUTREAL_OP(/, Div) -//Field3D operator+(BoutReal lhs, const Field3D& rhs); -//Field3D operator-(BoutReal lhs, const Field3D& rhs); -//Field3D operator*(BoutReal lhs, const Field3D& rhs); -//Field3D operator/(BoutReal lhs, const Field3D& rhs); - #define FIELD3D_BOUTREAL_FIELD3D_OP(OP_SYM, OP_TYPE) \ template \ std::enable_if_t && is_expr_field3d_v, \ diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index dca1605b91..c78a9ed7b7 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -6,148 +6,6 @@ #include #include -// Provide the C++ operator to update Field3D by multiplication with Field2D -Field3D& Field3D::operator*=(const Field2D& rhs) { - std::cout << 
"RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = fieldmesh->ind2Dto3D(index); - for (int jz = 0; jz < fieldmesh->LocalNz; ++jz) { - (*this)[base_ind + jz] *= rhs[index]; - } - } - - checkData(*this); - - } else { - (*this) = (*this) * rhs; - } - return *this; -} - -#if 0 -// Provide the C++ wrapper for addition of Field3D and Field2D -Field3D operator+(const Field3D& lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[base_ind + jz] + rhs[index]; - } - } - - checkData(result); - return result; -} -#endif - -// Provide the C++ operator to update Field3D by addition with Field2D -Field3D& Field3D::operator+=(const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = fieldmesh->ind2Dto3D(index); - for (int jz = 0; jz < fieldmesh->LocalNz; ++jz) { - (*this)[base_ind + jz] += rhs[index]; - } - } - - checkData(*this); - - } else { - (*this) = (*this) + rhs; - } - return *this; -} - -#if 0 -// Provide the C++ wrapper for subtraction of Field3D and Field2D -Field3D operator-(const Field3D& lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[base_ind + jz] - rhs[index]; - } - } - - checkData(result); - return result; -} -#endif - -// Provide the C++ operator to update Field3D by subtraction with Field2D -Field3D& Field3D::operator-=(const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = fieldmesh->ind2Dto3D(index); - for (int jz = 0; jz < fieldmesh->LocalNz; ++jz) { - (*this)[base_ind + jz] -= rhs[index]; - } - } - - checkData(*this); - - } else { - (*this) = (*this) - rhs; - } - return *this; -} - // Provide the C++ wrapper for multiplication of Field3D and FieldPerp FieldPerp operator*(const Field3D& lhs, const FieldPerp& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -233,211 +91,6 @@ FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs) { } #if 0 -// Provide the C++ wrapper for division of Field3D and BoutReal -Field3D operator/(const Field3D& lhs, const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - const auto tmp = 1.0 / rhs; - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * tmp; - } - - checkData(result); - return result; -} -#endif - -// Provide the C++ operator to update Field3D by division with BoutReal -Field3D& Field3D::operator/=(const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - const auto tmp = 1.0 / rhs; - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= tmp; } - - checkData(*this); - - } else { - (*this) = (*this) / rhs; - } - return *this; -} - -#if 0 -// Provide the C++ wrapper for addition of Field3D and BoutReal -Field3D operator+(const Field3D& lhs, const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] + rhs; - } - - checkData(result); - return result; -} -#endif - -// Provide the C++ operator to update Field3D by addition with BoutReal -Field3D& Field3D::operator+=(const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] += rhs; } - - checkData(*this); - - } else { - (*this) = (*this) + rhs; - } - return *this; -} - -#if 0 -// Provide the C++ wrapper for subtraction of Field3D and BoutReal -Field3D operator-(const Field3D& lhs, const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] - rhs; - } - - checkData(result); - return result; -} -#endif - -// Provide the C++ operator to update Field3D by subtraction with BoutReal -Field3D& Field3D::operator-=(const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] -= rhs; } - - checkData(*this); - - } else { - (*this) = (*this) - rhs; - } - return *this; -} - -// Provide the C++ wrapper for division of Field2D and Field3D -Field3D operator/(const Field2D& lhs, const Field3D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, lhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[index] / rhs[base_ind + jz]; - } - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for addition of Field2D and Field3D -Field3D operator+(const Field2D& lhs, const Field3D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, lhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[index] + rhs[base_ind + jz]; - } - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for subtraction of Field2D and Field3D -Field3D operator-(const Field2D& lhs, const Field3D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, 
lhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[index] - rhs[base_ind + jz]; - } - } - - checkData(result); - return result; -} - // Provide the C++ operator to update Field2D by division with Field2D Field2D& Field2D::operator/=(const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -458,6 +111,7 @@ Field2D& Field2D::operator/=(const Field2D& rhs) { } return *this; } +#endif // Provide the C++ wrapper for multiplication of Field2D and FieldPerp FieldPerp operator*(const Field2D& lhs, const FieldPerp& rhs) { @@ -543,22 +197,7 @@ FieldPerp operator-(const Field2D& lhs, const FieldPerp& rhs) { return result; } -// Provide the C++ wrapper for addition of Field2D and BoutReal -Field2D operator+(const Field2D& lhs, const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] + rhs; - } - - checkData(result); - return result; -} - +#if 0 // Provide the C++ operator to update Field2D by addition with BoutReal Field2D& Field2D::operator+=(const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -578,7 +217,9 @@ Field2D& Field2D::operator+=(const BoutReal rhs) { } return *this; } +#endif +#if 0 // Provide the C++ operator to update Field2D by subtraction with BoutReal Field2D& Field2D::operator-=(const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -598,6 +239,7 @@ Field2D& Field2D::operator-=(const BoutReal rhs) { } return *this; } +#endif // Provide the C++ wrapper for multiplication of FieldPerp and Field3D FieldPerp operator*(const FieldPerp& lhs, const Field3D& rhs) 
{ @@ -1280,24 +922,6 @@ FieldPerp& FieldPerp::operator-=(const BoutReal rhs) { return *this; } -#if 0 -// Provide the C++ wrapper for addition of BoutReal and Field2D -Field2D operator+(const BoutReal lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field2D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs + rhs[index]; - } - - checkData(result); - return result; -} -#endif - // Provide the C++ wrapper for multiplication of BoutReal and FieldPerp FieldPerp operator*(const BoutReal lhs, const FieldPerp& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; From 56a8678fb275fd6035e99bb1d506f047817a6d8d Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Sun, 1 Jun 2025 06:30:58 -0700 Subject: [PATCH 17/58] Cleanup --- src/field/generated_fieldops.cxx | 67 -------------------------------- 1 file changed, 67 deletions(-) diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index c78a9ed7b7..022fedbd17 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -90,29 +90,6 @@ FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs) { return result; } -#if 0 -// Provide the C++ operator to update Field2D by division with Field2D -Field2D& Field2D::operator/=(const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] /= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) / rhs; - } - return *this; -} -#endif - // Provide the C++ wrapper for multiplication of 
Field2D and FieldPerp FieldPerp operator*(const Field2D& lhs, const FieldPerp& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -197,50 +174,6 @@ FieldPerp operator-(const Field2D& lhs, const FieldPerp& rhs) { return result; } -#if 0 -// Provide the C++ operator to update Field2D by addition with BoutReal -Field2D& Field2D::operator+=(const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] += rhs; } - - checkData(*this); - - } else { - (*this) = (*this) + rhs; - } - return *this; -} -#endif - -#if 0 -// Provide the C++ operator to update Field2D by subtraction with BoutReal -Field2D& Field2D::operator-=(const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] -= rhs; } - - checkData(*this); - - } else { - (*this) = (*this) - rhs; - } - return *this; -} -#endif - // Provide the C++ wrapper for multiplication of FieldPerp and Field3D FieldPerp operator*(const FieldPerp& lhs, const Field3D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; From 3e3a0ea3973f222a31091ae42c1c360ebb34da3f Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Mon, 2 Jun 2025 20:09:40 -0700 Subject: [PATCH 18/58] Add __host__ to make evaluator host-callable, remove offset --- include/bout/bout_types.hxx | 2 +- include/bout/field2d.hxx | 7 +++- include/bout/field3d.hxx | 11 ++--- include/bout/fieldops.hxx | 81 
+++++++++++++++++++++++++------------ 4 files changed, 64 insertions(+), 37 deletions(-) diff --git a/include/bout/bout_types.hxx b/include/bout/bout_types.hxx index b2f38b61aa..c725c281d3 100644 --- a/include/bout/bout_types.hxx +++ b/include/bout/bout_types.hxx @@ -146,7 +146,7 @@ struct Constant { struct View { T v; View(T v) : v(v) {} - __device__ T operator()(int) const { return v; } + __host__ __device__ T operator()(int) const { return v; } }; operator View() const { return {val}; } }; diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index f430b98c6c..88f67f277b 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -108,6 +108,7 @@ public: || (is_expr_constant_v && is_expr_field2d_v) || (is_expr_field2d_v && is_expr_constant_v)>> Field2D(const BinaryExpr& expr) { + std::cout << "RUNNING Field2D constructor with CUDA\n"; Array data{expr.size()}; expr.evaluate(&data[0]); *this = std::move(Field2D{std::move(data), expr.getMesh(), expr.getLocation(), @@ -315,8 +316,10 @@ public: BoutReal* data; int mul = 1; int div = 1; - __device__ inline BoutReal operator()(int idx) const { return data[(idx*mul/div)]; } - __device__ inline BoutReal& operator[](int idx) const { + __host__ __device__ inline BoutReal operator()(int idx) const { + return data[(idx * mul / div)]; + } + __host__ __device__ inline BoutReal& operator[](int idx) const { return data[(idx * mul)/div]; } diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index fac2adc337..5056a4128b 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -429,12 +429,11 @@ public: BoutReal* data; int mul = 1; int div = 1; - int offset = 0; __host__ __device__ inline BoutReal operator()(int idx) const { - return data[(idx * mul) / div + offset]; + return data[(idx * mul) / div]; } - __device__ inline BoutReal& operator[](int idx) const { - return data[(idx * mul) / div + offset]; + __host__ __device__ inline BoutReal& operator[](int idx) const { + return 
data[(idx * mul) / div]; } View& setScale(int mul, int div) { @@ -442,10 +441,6 @@ public: this->div = div; return *this; } - View& setOffset(int o) { - offset = o; - return *this; - } }; operator View() { return View{&data[0]}; } operator View() const { return View{const_cast(&data[0])}; } diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 0de73d1205..9346b55e48 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -38,55 +38,58 @@ template struct is_expr_constant> : std::integral_constant>> {}; -constexpr int THREADS = 256; +constexpr int THREADS = 128; namespace bout { namespace op { struct Assign { int scale = 1; int offset = 0; template - __device__ void operator()(int idx, BoutReal* out, const Expr& expr) const { + __host__ __device__ void operator()(int idx, BoutReal* out, const Expr& expr) const { out[(idx * scale) + offset] = expr.lhs(idx) + expr.rhs(idx); } }; struct Add { template - __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, - const RView& R) const { + __host__ __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { return L(idx) + R(idx); } - __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { + __host__ __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { return a + b; } }; struct Sub { template - __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, - const RView& R) const { + __host__ __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { return L(idx) - R(idx); } - __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { + __host__ __device__ __forceinline__ BoutReal operator()(BoutReal a, + BoutReal b) const { return a - b; } }; struct Mul { template - __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, - const RView& R) const { + __host__ __device__ __forceinline__ BoutReal 
operator()(int idx, const LView& L, + const RView& R) const { return L(idx) * R(idx); } - __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { + __host__ __device__ __forceinline__ BoutReal operator()(BoutReal a, + BoutReal b) const { return a * b; } }; struct Div { template - __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, - const RView& R) const { + __host__ __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { return L(idx) / R(idx); } - __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { + __host__ __device__ __forceinline__ BoutReal operator()(BoutReal a, + BoutReal b) const { return a / b; } }; @@ -96,13 +99,24 @@ struct Add { template __global__ void __launch_bounds__(THREADS) evaluatorExpr(BoutReal* out, const Expr expr) { int tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid >= expr.size()) { + int e = expr.size(); + + // In-bounds version + //if (tid < e) { + // int idx = expr.regionIdx(tid); + // out[idx] = expr(idx); // single‐pass fusion + //} + + // Out-of-bounds version + if (tid >= e) { return; } int idx = expr.regionIdx(tid); out[idx] = expr(idx); // single‐pass fusion + + // Grid-strided loop //int stride = blockDim.x * gridDim.x; - //for (int i = tid, e = expr.size(); i < e; i += stride) { + //for (int i = tid; i < e; i += stride) { // int idx = expr.regionIdx(i); // out[idx] = expr(idx); // single‐pass fusion //} @@ -133,6 +147,18 @@ struct BinaryExpr { for (int i = 0; i < indices.size(); ++i) { indices[i] = region.getIndices()[i].ind; } + //std::cout << "===PRE-sorting indices\n"; + //for (auto& ind : indices) { + // std::cout << ind << " "; + //} + //std::cout << "===end PRE\n"; + //std::sort(indices.begin(), indices.end(), + // [](const auto& a, const auto& b) { return a < b; }); + //std::cout << "===POST-sorting indices\n"; + //for (auto& ind : indices) { + // std::cout << ind << " "; + //} + //std::cout << "===end 
POST\n"; //if (regionIndicesCache.find(static_cast(const_cast*>(®ion))) // != regionIndicesCache.end()) { // // If we have already computed the indices for this region, use them @@ -167,23 +193,19 @@ struct BinaryExpr { Func f; int mul = 1; int div = 1; - int offset = 0; View& setScale(int mul, int div) { this->mul = mul; this->div = div; return *this; } - View& setOffset(int o) { - offset = o; - return *this; + __host__ __device__ __forceinline__ int size() const { return num_indices; } + __host__ __device__ __forceinline__ int regionIdx(int idx) const { + return indices[idx]; } - - __device__ __forceinline__ int size() const { return num_indices; } - __device__ __forceinline__ int regionIdx(int idx) const { return indices[idx]; } - __device__ __forceinline__ BoutReal operator()(int idx) const { - return f((idx * mul) / div, lhs, rhs); // single‐pass fusion - //return f(lhs(idx), rhs(idx)); // single‐pass fusion + __host__ __device__ __forceinline__ BoutReal operator()(int idx) const { + //return f((idx * mul) / div, lhs, rhs); // single‐pass fusion + return f(lhs((idx * mul) / div), rhs((idx * mul) / div)); // single‐pass fusion } }; @@ -194,6 +216,13 @@ struct BinaryExpr { int blocks = (size() + THREADS - 1) / THREADS; evaluatorExpr<<>>(&data[0], static_cast(*this)); cudaDeviceSynchronize(); + // OpenMP impl. + //int e = size(); + //#pragma omp parallel for + //for (int i = 0; i < e; ++i) { + // int idx = regionIdx(i); + // data[idx] = operator()(idx); // single‐pass fusion + //} } Mesh* getMesh() const { return mesh; } From 56fe675a77441ec3f447b54d97ea49f7069e8375 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 4 Jun 2025 01:18:13 -0700 Subject: [PATCH 19/58] Update - Add field functions (sqrt, abs, etc.) - Add ResT template parameter to BinaryExpr for future use - Update some operators (min, max, etc.) 
to take as input a BinaryExpr and evaluate it before apply --- .../elm-pb-outerloop/elm_pb_outerloop.cxx | 8 +- include/bout/field.hxx | 52 +++++-- include/bout/field2d.hxx | 127 +++++++++--------- include/bout/field3d.hxx | 110 +++++++-------- include/bout/fieldops.hxx | 13 +- include/bout/fieldperp.hxx | 39 ++++++ src/field/field2d.cxx | 3 +- src/field/fieldperp.cxx | 2 +- src/mesh/coordinates.cxx | 2 +- src/physics/snb.cxx | 4 +- 10 files changed, 219 insertions(+), 141 deletions(-) diff --git a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx index 901e57bf97..96c28bab12 100644 --- a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx +++ b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx @@ -713,7 +713,7 @@ class ELMpb : public PhysicsModel { diamag_phi0 = false; K_H_term = false; } else { - Dphi0 = -D_min - 0.5 * D_0 * (1.0 - tanh(Field2D{D_s * (x - x0)})); + Dphi0 = -D_min - 0.5 * D_0 * (1.0 - tanh(D_s * (x - x0))); } if (sign < 0) { // change flow direction @@ -1031,7 +1031,7 @@ class ELMpb : public PhysicsModel { vacuum_trans *= pnorm; // Transitions from 0 in core to 1 in vacuum - Field2D tanh_res = tanh(Field2D{(P0 - vacuum_pressure) / vacuum_trans}); + Field2D tanh_res = tanh((P0 - vacuum_pressure) / vacuum_trans); vac_mask = (1.0 - tanh_res) / 2.0; if (spitzer_resist) { @@ -1283,7 +1283,7 @@ class ELMpb : public PhysicsModel { //////////////////////////////////////////// // Transitions from 0 in core to 1 in vacuum if (nonlinear) { - vac_mask = (1.0 - tanh(Field3D{((P0 + P) - vacuum_pressure) / vacuum_trans})) / 2.0; + vac_mask = (1.0 - tanh(((P0 + P) - vacuum_pressure) / vacuum_trans)) / 2.0; // Update resistivity if (spitzer_resist) { @@ -1794,7 +1794,7 @@ class ELMpb : public PhysicsModel { // Calculate coefficient. 
hyper_mu_x = hyperviscos * metric->g_11 * SQ(metric->dx) - * abs(Field3D{metric->g11 * D2DX2(U)}) / (abs(U) + 1e-3); + * abs(metric->g11 * D2DX2(U)) / (abs(U) + 1e-3); hyper_mu_x.applyBoundary("dirichlet"); // Set to zero on all boundaries ddt(U) += hyper_mu_x * metric->g11 * D2DX2(U); diff --git a/include/bout/field.hxx b/include/bout/field.hxx index 188d529ef0..fe2b4767d2 100644 --- a/include/bout/field.hxx +++ b/include/bout/field.hxx @@ -44,6 +44,8 @@ class Field; #include #include +#include "bout/fieldops.hxx" + class Mesh; /// Base class for scalar fields @@ -327,6 +329,12 @@ inline BoutReal min(const T& f, bool allpe = false, return result; } +template +inline BoutReal min(const BinaryExpr& f, bool allpe = false, + const std::string& rgn = "RGN_NOBNDRY") { + return min(ResT{f}, allpe, rgn); +} + /// Returns true if all elements of \p f over \p region are equal. By /// default only checks the local processor, use \p allpe to check /// globally @@ -412,6 +420,12 @@ inline BoutReal max(const T& f, bool allpe = false, return result; } +template +inline BoutReal max(const BinaryExpr& f, bool allpe = false, + const std::string& rgn = "RGN_NOBNDRY") { + return max(ResT{f}, allpe, rgn); +} + /// Mean of \p f, excluding the boundary/guard cells by default (can /// be changed with \p rgn argument). 
/// @@ -519,17 +533,33 @@ T pow(BoutReal lhs, const T& rhs, const std::string& rgn = "RGN_ALL") { #ifdef FIELD_FUNC #error This macro has already been defined #else -#define FIELD_FUNC(name, func) \ - template > \ - inline T name(const T& f, const std::string& rgn = "RGN_ALL") { \ - AUTO_TRACE(); \ - /* Check if the input is allocated */ \ - checkData(f); \ - /* Define and allocate the output result */ \ - T result{emptyFrom(f)}; \ - BOUT_FOR(d, result.getRegion(rgn)) { result[d] = func(f[d]); } \ - checkData(result); \ - return result; \ +#define FIELD_FUNC(name, func) \ + namespace bout::op { \ + struct name { \ + template \ + __host__ __device__ BoutReal operator()(int idx, const LView& L, \ + const RView& R) const { \ + return func(L(idx)); \ + } \ + }; \ + }; \ + template > \ + inline BinaryExpr name(const T& f, \ + const std::string& rgn = "RGN_ALL") { \ + std::cout << "RUNNING " #name " with CUDA\n"; \ + return BinaryExpr{static_cast(f), \ + static_cast(f), \ + bout::op::name{}, \ + f.getMesh(), \ + f.getLocation(), \ + f.getDirections(), \ + std::nullopt, \ + f.getRegion(rgn)}; \ + } \ + template \ + inline BinaryExpr name( \ + const BinaryExpr& f, const std::string& rgn = "RGN_ALL") { \ + return name(ResT{f}, rgn); \ } #endif diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index 88f67f277b..da8de551ad 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -47,8 +47,8 @@ class Field2D; class Field3D; class Mesh; -template -struct is_expr_field2d> +template +struct is_expr_field2d> : std::integral_constant> && is_expr_field2d_v>) || (is_expr_constant_v> @@ -103,11 +103,11 @@ public: ZDirectionType::Average}); template < - typename L, typename R, typename Func, + typename ResT, typename L, typename R, typename Func, typename = std::enable_if_t<(is_expr_field2d_v && is_expr_field2d_v) || (is_expr_constant_v && is_expr_field2d_v) || (is_expr_field2d_v && is_expr_constant_v)>> - Field2D(const BinaryExpr& expr) { + Field2D(const 
BinaryExpr& expr) { std::cout << "RUNNING Field2D constructor with CUDA\n"; Array data{expr.size()}; expr.evaluate(&data[0]); @@ -189,9 +189,9 @@ public: */ Field2D& operator=(BoutReal rhs); - template + template std::enable_if_t, Field2D&> - operator=(const BinaryExpr& expr) { + operator=(const BinaryExpr& expr) { std::cout << "RUNNING Field2D operator= with CUDA\n"; if (isAllocated()) { expr.evaluate(&data[0]); @@ -348,19 +348,20 @@ private: // Non-member overloaded operators -#define FIELD2D_FIELD2D_FIELD2D_OP(OP_SYM, OP_TYPE) \ - template \ - std::enable_if_t && is_expr_field2d_v, \ - BinaryExpr> operator OP_SYM(const L & lhs, \ - const R & rhs) { \ - return BinaryExpr{static_cast(lhs), \ - static_cast(rhs), \ - bout::op::OP_TYPE{}, \ - lhs.getMesh(), \ - lhs.getLocation(), \ - lhs.getDirections(), \ - std::nullopt, \ - lhs.getMesh()->getRegion2D("RGN_ALL")}; \ +#define FIELD2D_FIELD2D_FIELD2D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field2d_v, \ + BinaryExpr> \ + operator OP_SYM(const L & lhs, const R & rhs) { \ + return BinaryExpr{ \ + static_cast(lhs), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + std::nullopt, \ + lhs.getMesh()->getRegion2D("RGN_ALL")}; \ } FIELD2D_FIELD2D_FIELD2D_OP(+, Add) @@ -368,22 +369,22 @@ FIELD2D_FIELD2D_FIELD2D_OP(-, Sub) FIELD2D_FIELD2D_FIELD2D_OP(*, Mul) FIELD2D_FIELD2D_FIELD2D_OP(/, Div) -#define FIELD3D_FIELD2D_FIELD3D_OP(OP_SYM, OP_TYPE) \ - template \ - std::enable_if_t && is_expr_field3d_v, \ - BinaryExpr> operator OP_SYM(const L & lhs, \ - const R & rhs) { \ - auto regionID = rhs.getRegionID(); \ - int mesh_nz = rhs.getMesh()->LocalNz; \ - return BinaryExpr{ \ - static_cast(lhs).setScale(1, mesh_nz), \ - static_cast(rhs), \ - bout::op::OP_TYPE{}, \ - rhs.getMesh(), \ - rhs.getLocation(), \ - rhs.getDirections(), \ - regionID, \ - rhs.getMesh()->getRegion("RGN_ALL")}; \ +#define FIELD3D_FIELD2D_FIELD3D_OP(OP_SYM, OP_TYPE) \ + 
template \ + std::enable_if_t && is_expr_field3d_v, \ + BinaryExpr> \ + operator OP_SYM(const L & lhs, const R & rhs) { \ + auto regionID = rhs.getRegionID(); \ + int mesh_nz = rhs.getMesh()->LocalNz; \ + return BinaryExpr{ \ + static_cast(lhs).setScale(1, mesh_nz), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + rhs.getMesh(), \ + rhs.getLocation(), \ + rhs.getDirections(), \ + regionID, \ + rhs.getMesh()->getRegion("RGN_ALL")}; \ } FIELD3D_FIELD2D_FIELD3D_OP(+, Add) @@ -391,20 +392,20 @@ FIELD3D_FIELD2D_FIELD3D_OP(-, Sub) FIELD3D_FIELD2D_FIELD3D_OP(*, Mul) FIELD3D_FIELD2D_FIELD3D_OP(/, Div) -#define FIELD2D_FIELD2D_BOUTREAL_OP(OP_SYM, OP_TYPE) \ - template \ - std::enable_if_t && is_expr_constant_v, \ - BinaryExpr, bout::op::OP_TYPE>> \ - operator OP_SYM(const L & lhs, R rhs) { \ - return BinaryExpr, bout::op::OP_TYPE>{ \ - static_cast(lhs), \ - static_cast::View>(rhs), \ - bout::op::OP_TYPE{}, \ - lhs.getMesh(), \ - lhs.getLocation(), \ - lhs.getDirections(), \ - std::nullopt, \ - lhs.getMesh()->getRegion2D("RGN_ALL")}; \ +#define FIELD2D_FIELD2D_BOUTREAL_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_constant_v, \ + BinaryExpr, bout::op::OP_TYPE>> \ + operator OP_SYM(const L & lhs, R rhs) { \ + return BinaryExpr, bout::op::OP_TYPE>{ \ + static_cast(lhs), \ + static_cast::View>(rhs), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + std::nullopt, \ + lhs.getMesh()->getRegion2D("RGN_ALL")}; \ } FIELD2D_FIELD2D_BOUTREAL_OP(+, Add) @@ -412,20 +413,20 @@ FIELD2D_FIELD2D_BOUTREAL_OP(-, Sub) FIELD2D_FIELD2D_BOUTREAL_OP(*, Mul) FIELD2D_FIELD2D_BOUTREAL_OP(/, Div) -#define FIELD2D_BOUTREAL_FIELD2D_OP(OP_SYM, OP_TYPE) \ - template \ - std::enable_if_t && is_expr_field2d_v, \ - BinaryExpr, R, bout::op::OP_TYPE>> \ - operator OP_SYM(L lhs, const R & rhs) { \ - return BinaryExpr, R, bout::op::OP_TYPE>{ \ - static_cast::View>(lhs), \ - static_cast(rhs), \ - bout::op::OP_TYPE{}, \ - rhs.getMesh(), \ - 
rhs.getLocation(), \ - rhs.getDirections(), \ - std::nullopt, \ - rhs.getMesh()->getRegion2D("RGN_ALL")}; \ +#define FIELD2D_BOUTREAL_FIELD2D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field2d_v, \ + BinaryExpr, R, bout::op::OP_TYPE>> \ + operator OP_SYM(L lhs, const R & rhs) { \ + return BinaryExpr, R, bout::op::OP_TYPE>{ \ + static_cast::View>(lhs), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + rhs.getMesh(), \ + rhs.getLocation(), \ + rhs.getDirections(), \ + std::nullopt, \ + rhs.getMesh()->getRegion2D("RGN_ALL")}; \ } FIELD2D_BOUTREAL_FIELD2D_OP(+, Add) diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 5056a4128b..62b299bc48 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -187,7 +187,7 @@ public: ZDirectionType::Standard}); template || is_expr_field3d_v>> - Field3D(const BinaryExpr& expr) { + Field3D(const BinaryExpr& expr) { //std::cout << "RUNNING constructor from BinaryExpr\n"; Array data{expr.size()}; expr.evaluate(&data[0]); @@ -457,9 +457,9 @@ public: /// return void, as only part initialised void operator=(const FieldPerp& rhs); Field3D& operator=(BoutReal val); - template + template std::enable_if_t, Field3D&> - operator=(BinaryExpr& expr) { + operator=(BinaryExpr& expr) { std::cout << "RUNNING operator= with CUDA\n"; regionID = expr.getRegionID(); if(isAllocated()) { @@ -565,10 +565,11 @@ FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); #define FIELD3D_FIELD3D_FIELD3D_OP(OP_SYM, OP_TYPE) \ template && is_expr_field3d_v>> \ - BinaryExpr operator OP_SYM(const L & lhs, const R & rhs) { \ + BinaryExpr operator OP_SYM(const L & lhs, \ + const R & rhs) { \ auto regionID = \ lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); \ - return BinaryExpr{ \ + return BinaryExpr{ \ static_cast(lhs), \ static_cast(rhs), \ bout::op::OP_TYPE{}, \ @@ -585,22 +586,22 @@ FIELD3D_FIELD3D_FIELD3D_OP(-, Sub) FIELD3D_FIELD3D_FIELD3D_OP(*, Mul) FIELD3D_FIELD3D_FIELD3D_OP(/, 
Div) -#define FIELD3D_FIELD3D_FIELD2D_OP(OP_SYM, OP_TYPE) \ - template \ - std::enable_if_t && is_expr_field2d_v, \ - BinaryExpr> operator OP_SYM(const L & lhs, \ - const R & rhs) { \ - auto regionID = lhs.getRegionID(); \ - int mesh_nz = lhs.getMesh()->LocalNz; \ - return BinaryExpr{ \ - static_cast(lhs), \ - static_cast(rhs).setScale(1, mesh_nz), \ - bout::op::OP_TYPE{}, \ - lhs.getMesh(), \ - lhs.getLocation(), \ - lhs.getDirections(), \ - regionID, \ - lhs.getMesh()->getRegion("RGN_ALL")}; \ +#define FIELD3D_FIELD3D_FIELD2D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field2d_v, \ + BinaryExpr> \ + operator OP_SYM(const L & lhs, const R & rhs) { \ + auto regionID = lhs.getRegionID(); \ + int mesh_nz = lhs.getMesh()->LocalNz; \ + return BinaryExpr{ \ + static_cast(lhs), \ + static_cast(rhs).setScale(1, mesh_nz), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + regionID, \ + lhs.getMesh()->getRegion("RGN_ALL")}; \ } FIELD3D_FIELD3D_FIELD2D_OP(+, Add) @@ -608,21 +609,21 @@ FIELD3D_FIELD3D_FIELD2D_OP(-, Sub) FIELD3D_FIELD3D_FIELD2D_OP(*, Mul) FIELD3D_FIELD3D_FIELD2D_OP(/, Div) -#define FIELD3D_FIELD3D_BOUTREAL_OP(OP_SYM, OP_TYPE) \ - template \ - std::enable_if_t && is_expr_constant_v, \ - BinaryExpr, bout::op::OP_TYPE>> \ - operator OP_SYM(const L & lhs, R rhs) { \ - auto regionID = lhs.getRegionID(); \ - return BinaryExpr, bout::op::OP_TYPE>{ \ - static_cast(lhs), \ - static_cast::View>(rhs), \ - bout::op::OP_TYPE{}, \ - lhs.getMesh(), \ - lhs.getLocation(), \ - lhs.getDirections(), \ - regionID, \ - lhs.getMesh()->getRegion("RGN_ALL")}; \ +#define FIELD3D_FIELD3D_BOUTREAL_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_constant_v, \ + BinaryExpr, bout::op::OP_TYPE>> \ + operator OP_SYM(const L & lhs, R rhs) { \ + auto regionID = lhs.getRegionID(); \ + return BinaryExpr, bout::op::OP_TYPE>{ \ + static_cast(lhs), \ + static_cast::View>(rhs), \ + bout::op::OP_TYPE{}, \ + 
lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + regionID, \ + lhs.getMesh()->getRegion("RGN_ALL")}; \ } FIELD3D_FIELD3D_BOUTREAL_OP(+, Add) @@ -630,21 +631,21 @@ FIELD3D_FIELD3D_BOUTREAL_OP(-, Sub) FIELD3D_FIELD3D_BOUTREAL_OP(*, Mul) FIELD3D_FIELD3D_BOUTREAL_OP(/, Div) -#define FIELD3D_BOUTREAL_FIELD3D_OP(OP_SYM, OP_TYPE) \ - template \ - std::enable_if_t && is_expr_field3d_v, \ - BinaryExpr, R, bout::op::OP_TYPE>> \ - operator OP_SYM(const L & lhs, const R & rhs) { \ - auto regionID = rhs.getRegionID(); \ - return BinaryExpr, R, bout::op::OP_TYPE>{ \ - static_cast::View>(lhs), \ - static_cast(rhs), \ - bout::op::OP_TYPE{}, \ - rhs.getMesh(), \ - rhs.getLocation(), \ - rhs.getDirections(), \ - regionID, \ - rhs.getMesh()->getRegion("RGN_ALL")}; \ +#define FIELD3D_BOUTREAL_FIELD3D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field3d_v, \ + BinaryExpr, R, bout::op::OP_TYPE>> \ + operator OP_SYM(const L & lhs, const R & rhs) { \ + auto regionID = rhs.getRegionID(); \ + return BinaryExpr, R, bout::op::OP_TYPE>{ \ + static_cast::View>(lhs), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + rhs.getMesh(), \ + rhs.getLocation(), \ + rhs.getDirections(), \ + regionID, \ + rhs.getMesh()->getRegion("RGN_ALL")}; \ } FIELD3D_BOUTREAL_FIELD3D_OP(+, Add) @@ -771,8 +772,9 @@ struct is_expr_field3d : std::true_type {}; template <> struct is_expr_field2d : std::true_type {}; -template -struct is_expr_field3d> - : std::integral_constant>::value || is_expr_field3d_v>> {}; +template +struct is_expr_field3d> + : std::integral_constant>::value + || is_expr_field3d_v>> {}; #endif /* BOUT_FIELD3D_H */ diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 9346b55e48..48d104e3ea 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -24,6 +24,12 @@ inline constexpr bool is_expr_field2d_v = is_expr_field2d>::valu template struct is_expr_field3d : std::false_type {}; +template +struct is_expr_fieldperp : 
std::false_type {}; + +template +inline constexpr bool is_expr_fieldperp_v = is_expr_fieldperp>::value; + // Helper variable template template inline constexpr bool is_expr_field3d_v = is_expr_field3d>::value; @@ -124,7 +130,7 @@ __global__ void __launch_bounds__(THREADS) evaluatorExpr(BoutReal* out, const Ex inline std::unordered_map> regionIndicesCache; -template +template struct BinaryExpr { typename L::View lhs; typename R::View rhs; @@ -185,6 +191,7 @@ struct BinaryExpr { } inline int regionIdx(int idx) const { return indices[idx]; } + //operator ResT() { return ResT{*this}; } struct View { typename L::View lhs; typename R::View rhs; @@ -204,8 +211,8 @@ struct BinaryExpr { return indices[idx]; } __host__ __device__ __forceinline__ BoutReal operator()(int idx) const { - //return f((idx * mul) / div, lhs, rhs); // single‐pass fusion - return f(lhs((idx * mul) / div), rhs((idx * mul) / div)); // single‐pass fusion + return f((idx * mul) / div, lhs, rhs); // single‐pass fusion + //return f(lhs((idx * mul) / div), rhs((idx * mul) / div)); // single‐pass fusion } }; diff --git a/include/bout/fieldperp.hxx b/include/bout/fieldperp.hxx index 6995308dbe..49d4fec1b7 100644 --- a/include/bout/fieldperp.hxx +++ b/include/bout/fieldperp.hxx @@ -86,6 +86,17 @@ public: DirectionTypes directions_in = {YDirectionType::Standard, ZDirectionType::Standard}); + template < + typename ResT, typename L, typename R, typename Func, + typename = std::enable_if_t<(is_expr_fieldperp_v && is_expr_fieldperp_v)>> + FieldPerp(const BinaryExpr& expr) { + std::cout << "RUNNING FieldPerp constructor with CUDA\n"; + Array data{expr.size()}; + expr.evaluate(&data[0]); + *this = std::move(FieldPerp{std::move(data), expr.getMesh(), expr.getLocation(), + /* yindex */ -1, expr.getDirections()}); + } + ~FieldPerp() override = default; /*! 
@@ -292,6 +303,26 @@ public: int size() const override { return nx * nz; }; + struct View { + BoutReal* data; + int mul = 1; + int div = 1; + __host__ __device__ inline BoutReal operator()(int idx) const { + return data[(idx * mul) / div]; + } + __host__ __device__ inline BoutReal& operator[](int idx) const { + return data[(idx * mul) / div]; + } + + View& setScale(int mul, int div) { + this->mul = mul; + this->div = div; + return *this; + } + }; + operator View() { return View{&data[0]}; } + operator View() const { return View{const_cast(&data[0])}; } + private: /// The Y index at which this FieldPerp is defined int yindex{-1}; @@ -379,4 +410,12 @@ bool operator==(const FieldPerp& a, const FieldPerp& b); /// Output a string describing a FieldPerp to a stream std::ostream& operator<<(std::ostream& out, const FieldPerp& value); +template <> +struct is_expr_fieldperp : std::true_type {}; + +template +struct is_expr_fieldperp> + : std::integral_constant> + && is_expr_fieldperp_v>> {}; + #endif diff --git a/src/field/field2d.cxx b/src/field/field2d.cxx index e5c1d466b7..c8b9ebb689 100644 --- a/src/field/field2d.cxx +++ b/src/field/field2d.cxx @@ -389,8 +389,7 @@ bool operator==(const Field2D& a, const Field2D& b) { if (!a.isAllocated() || !b.isAllocated()) { return false; } - Field2D diff = a - b; - return min(abs(diff)) < 1e-10; + return min(abs(a - b)) < 1e-10; } std::ostream& operator<<(std::ostream& out, const Field2D& value) { diff --git a/src/field/fieldperp.cxx b/src/field/fieldperp.cxx index ca9bdc0397..9578aa0d9d 100644 --- a/src/field/fieldperp.cxx +++ b/src/field/fieldperp.cxx @@ -209,7 +209,7 @@ bool operator==(const FieldPerp& a, const FieldPerp& b) { if (!a.isAllocated() || !b.isAllocated()) { return false; } - return (a.getIndex() == b.getIndex()) and (min(abs(a - b)) < 1e-10); + return (a.getIndex() == b.getIndex()) and (min(FieldPerp{abs(a - b)}) < 1e-10); } std::ostream& operator<<(std::ostream& out, const FieldPerp& value) { diff --git 
a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx index b8fd33c019..0139d21fd7 100644 --- a/src/mesh/coordinates.cxx +++ b/src/mesh/coordinates.cxx @@ -1349,7 +1349,7 @@ int Coordinates::jacobian() { // Check that g is positive bout::checkPositive(g, "The determinant of g^ij", "RGN_NOBNDRY"); - J = 1. / sqrt(Field2D{g}); + J = 1. / sqrt(g); // More robust to extrapolate derived quantities directly, rather than // deriving from extrapolated covariant metric components J = interpolateAndExtrapolate(J, location, extrapolate_x, extrapolate_y, false, diff --git a/src/physics/snb.cxx b/src/physics/snb.cxx index f21bfb7ee0..80da9e1bf8 100644 --- a/src/physics/snb.cxx +++ b/src/physics/snb.cxx @@ -11,10 +11,10 @@ namespace bout { Field3D HeatFluxSNB::divHeatFlux(const Field3D& Te, const Field3D& Ne, Field3D* Div_Q_SH_out) { - Field3D thermal_speed = sqrt(Field3D{2. * SI::qe * Te / SI::Me}); + Field3D thermal_speed = sqrt(2. * SI::qe * Te / SI::Me); BoutReal Y = SQ(SQ(SI::qe) / (SI::e0 * SI::Me)) / (4 * PI); - Field3D coulomb_log = 6.6 - 0.5 * log(Field3D{Ne * 1e-20}) + 1.5 * log(Te); + Field3D coulomb_log = 6.6 - 0.5 * log(Ne * 1e-20) + 1.5 * log(Te); // Thermal electron-electron mean free path [m] Field3D lambda_ee_T = pow(thermal_speed, 4) / (Y * Ne * coulomb_log); From b75103dcaf716636987e711dd78defa78a544ddb Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 2025 03:43:11 -0700 Subject: [PATCH 20/58] Add FFT GPU shiftZ --- include/bout/twiddle.hxx | 2046 +++++++++++++++++++++++++++ src/mesh/parallel/shiftedmetric.cxx | 325 ++++- 2 files changed, 2367 insertions(+), 4 deletions(-) create mode 100644 include/bout/twiddle.hxx diff --git a/include/bout/twiddle.hxx b/include/bout/twiddle.hxx new file mode 100644 index 0000000000..ae4f729b48 --- /dev/null +++ b/include/bout/twiddle.hxx @@ -0,0 +1,2046 @@ +__constant__ double2 c_twiddle_fwd_16[16] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9238795325112867, -0.3826834323650898}, // k=1 
+ {0.7071067811865476, -0.7071067811865475}, // k=2 + {0.3826834323650898, -0.9238795325112867}, // k=3 + {0.0000000000000001, -1.0000000000000000}, // k=4 + {-0.3826834323650897, -0.9238795325112867}, // k=5 + {-0.7071067811865475, -0.7071067811865476}, // k=6 + {-0.9238795325112867, -0.3826834323650899}, // k=7 + {-1.0000000000000000, -0.0000000000000001}, // k=8 + {-0.9238795325112868, 0.3826834323650897}, // k=9 + {-0.7071067811865477, 0.7071067811865475}, // k=10 + {-0.3826834323650903, 0.9238795325112865}, // k=11 + {-0.0000000000000002, 1.0000000000000000}, // k=12 + {0.3826834323650900, 0.9238795325112866}, // k=13 + {0.7071067811865474, 0.7071067811865477}, // k=14 + {0.9238795325112865, 0.3826834323650904}, // k=15 +}; + +__constant__ double2 c_twiddle_inv_16[16] = { + {1.0000000000000000, 0.0000000000000000}, // k=0 + {0.9238795325112867, 0.3826834323650898}, // k=1 + {0.7071067811865476, 0.7071067811865475}, // k=2 + {0.3826834323650898, 0.9238795325112867}, // k=3 + {0.0000000000000001, 1.0000000000000000}, // k=4 + {-0.3826834323650897, 0.9238795325112867}, // k=5 + {-0.7071067811865475, 0.7071067811865476}, // k=6 + {-0.9238795325112867, 0.3826834323650899}, // k=7 + {-1.0000000000000000, 0.0000000000000001}, // k=8 + {-0.9238795325112868, -0.3826834323650897}, // k=9 + {-0.7071067811865477, -0.7071067811865475}, // k=10 + {-0.3826834323650903, -0.9238795325112865}, // k=11 + {-0.0000000000000002, -1.0000000000000000}, // k=12 + {0.3826834323650900, -0.9238795325112866}, // k=13 + {0.7071067811865474, -0.7071067811865477}, // k=14 + {0.9238795325112865, -0.3826834323650904}, // k=15 +}; +__constant__ double2 c_twiddle_fwd_32[32] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9807852804032304, -0.1950903220161282}, // k=1 + {0.9238795325112867, -0.3826834323650898}, // k=2 + {0.8314696123025452, -0.5555702330196022}, // k=3 + {0.7071067811865476, -0.7071067811865475}, // k=4 + {0.5555702330196023, -0.8314696123025452}, // k=5 + 
{0.3826834323650898, -0.9238795325112867}, // k=6 + {0.1950903220161283, -0.9807852804032304}, // k=7 + {0.0000000000000001, -1.0000000000000000}, // k=8 + {-0.1950903220161282, -0.9807852804032304}, // k=9 + {-0.3826834323650897, -0.9238795325112867}, // k=10 + {-0.5555702330196020, -0.8314696123025455}, // k=11 + {-0.7071067811865475, -0.7071067811865476}, // k=12 + {-0.8314696123025453, -0.5555702330196022}, // k=13 + {-0.9238795325112867, -0.3826834323650899}, // k=14 + {-0.9807852804032304, -0.1950903220161286}, // k=15 + {-1.0000000000000000, -0.0000000000000001}, // k=16 + {-0.9807852804032304, 0.1950903220161284}, // k=17 + {-0.9238795325112868, 0.3826834323650897}, // k=18 + {-0.8314696123025455, 0.5555702330196020}, // k=19 + {-0.7071067811865477, 0.7071067811865475}, // k=20 + {-0.5555702330196022, 0.8314696123025452}, // k=21 + {-0.3826834323650903, 0.9238795325112865}, // k=22 + {-0.1950903220161287, 0.9807852804032303}, // k=23 + {-0.0000000000000002, 1.0000000000000000}, // k=24 + {0.1950903220161283, 0.9807852804032304}, // k=25 + {0.3826834323650900, 0.9238795325112866}, // k=26 + {0.5555702330196018, 0.8314696123025455}, // k=27 + {0.7071067811865474, 0.7071067811865477}, // k=28 + {0.8314696123025452, 0.5555702330196022}, // k=29 + {0.9238795325112865, 0.3826834323650904}, // k=30 + {0.9807852804032303, 0.1950903220161287}, // k=31 +}; + +__constant__ double2 c_twiddle_inv_32[32] = { + {1.0000000000000000, 0.0000000000000000}, // k=0 + {0.9807852804032304, 0.1950903220161282}, // k=1 + {0.9238795325112867, 0.3826834323650898}, // k=2 + {0.8314696123025452, 0.5555702330196022}, // k=3 + {0.7071067811865476, 0.7071067811865475}, // k=4 + {0.5555702330196023, 0.8314696123025452}, // k=5 + {0.3826834323650898, 0.9238795325112867}, // k=6 + {0.1950903220161283, 0.9807852804032304}, // k=7 + {0.0000000000000001, 1.0000000000000000}, // k=8 + {-0.1950903220161282, 0.9807852804032304}, // k=9 + {-0.3826834323650897, 0.9238795325112867}, // k=10 + 
{-0.5555702330196020, 0.8314696123025455}, // k=11 + {-0.7071067811865475, 0.7071067811865476}, // k=12 + {-0.8314696123025453, 0.5555702330196022}, // k=13 + {-0.9238795325112867, 0.3826834323650899}, // k=14 + {-0.9807852804032304, 0.1950903220161286}, // k=15 + {-1.0000000000000000, 0.0000000000000001}, // k=16 + {-0.9807852804032304, -0.1950903220161284}, // k=17 + {-0.9238795325112868, -0.3826834323650897}, // k=18 + {-0.8314696123025455, -0.5555702330196020}, // k=19 + {-0.7071067811865477, -0.7071067811865475}, // k=20 + {-0.5555702330196022, -0.8314696123025452}, // k=21 + {-0.3826834323650903, -0.9238795325112865}, // k=22 + {-0.1950903220161287, -0.9807852804032303}, // k=23 + {-0.0000000000000002, -1.0000000000000000}, // k=24 + {0.1950903220161283, -0.9807852804032304}, // k=25 + {0.3826834323650900, -0.9238795325112866}, // k=26 + {0.5555702330196018, -0.8314696123025455}, // k=27 + {0.7071067811865474, -0.7071067811865477}, // k=28 + {0.8314696123025452, -0.5555702330196022}, // k=29 + {0.9238795325112865, -0.3826834323650904}, // k=30 + {0.9807852804032303, -0.1950903220161287}, // k=31 +}; +__constant__ double2 c_twiddle_fwd_64[64] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9951847266721969, -0.0980171403295606}, // k=1 + {0.9807852804032304, -0.1950903220161282}, // k=2 + {0.9569403357322088, -0.2902846772544623}, // k=3 + {0.9238795325112867, -0.3826834323650898}, // k=4 + {0.8819212643483550, -0.4713967368259976}, // k=5 + {0.8314696123025452, -0.5555702330196022}, // k=6 + {0.7730104533627370, -0.6343932841636455}, // k=7 + {0.7071067811865476, -0.7071067811865475}, // k=8 + {0.6343932841636455, -0.7730104533627370}, // k=9 + {0.5555702330196023, -0.8314696123025452}, // k=10 + {0.4713967368259978, -0.8819212643483549}, // k=11 + {0.3826834323650898, -0.9238795325112867}, // k=12 + {0.2902846772544623, -0.9569403357322089}, // k=13 + {0.1950903220161283, -0.9807852804032304}, // k=14 + {0.0980171403295608, -0.9951847266721968}, 
// k=15 + {0.0000000000000001, -1.0000000000000000}, // k=16 + {-0.0980171403295606, -0.9951847266721969}, // k=17 + {-0.1950903220161282, -0.9807852804032304}, // k=18 + {-0.2902846772544622, -0.9569403357322089}, // k=19 + {-0.3826834323650897, -0.9238795325112867}, // k=20 + {-0.4713967368259977, -0.8819212643483550}, // k=21 + {-0.5555702330196020, -0.8314696123025455}, // k=22 + {-0.6343932841636454, -0.7730104533627371}, // k=23 + {-0.7071067811865475, -0.7071067811865476}, // k=24 + {-0.7730104533627370, -0.6343932841636455}, // k=25 + {-0.8314696123025453, -0.5555702330196022}, // k=26 + {-0.8819212643483549, -0.4713967368259979}, // k=27 + {-0.9238795325112867, -0.3826834323650899}, // k=28 + {-0.9569403357322088, -0.2902846772544624}, // k=29 + {-0.9807852804032304, -0.1950903220161286}, // k=30 + {-0.9951847266721968, -0.0980171403295608}, // k=31 + {-1.0000000000000000, -0.0000000000000001}, // k=32 + {-0.9951847266721969, 0.0980171403295606}, // k=33 + {-0.9807852804032304, 0.1950903220161284}, // k=34 + {-0.9569403357322089, 0.2902846772544621}, // k=35 + {-0.9238795325112868, 0.3826834323650897}, // k=36 + {-0.8819212643483550, 0.4713967368259976}, // k=37 + {-0.8314696123025455, 0.5555702330196020}, // k=38 + {-0.7730104533627371, 0.6343932841636453}, // k=39 + {-0.7071067811865477, 0.7071067811865475}, // k=40 + {-0.6343932841636459, 0.7730104533627367}, // k=41 + {-0.5555702330196022, 0.8314696123025452}, // k=42 + {-0.4713967368259979, 0.8819212643483549}, // k=43 + {-0.3826834323650903, 0.9238795325112865}, // k=44 + {-0.2902846772544624, 0.9569403357322088}, // k=45 + {-0.1950903220161287, 0.9807852804032303}, // k=46 + {-0.0980171403295605, 0.9951847266721969}, // k=47 + {-0.0000000000000002, 1.0000000000000000}, // k=48 + {0.0980171403295601, 0.9951847266721969}, // k=49 + {0.1950903220161283, 0.9807852804032304}, // k=50 + {0.2902846772544621, 0.9569403357322089}, // k=51 + {0.3826834323650900, 0.9238795325112866}, // k=52 + 
{0.4713967368259976, 0.8819212643483550}, // k=53 + {0.5555702330196018, 0.8314696123025455}, // k=54 + {0.6343932841636456, 0.7730104533627369}, // k=55 + {0.7071067811865474, 0.7071067811865477}, // k=56 + {0.7730104533627367, 0.6343932841636459}, // k=57 + {0.8314696123025452, 0.5555702330196022}, // k=58 + {0.8819212643483548, 0.4713967368259979}, // k=59 + {0.9238795325112865, 0.3826834323650904}, // k=60 + {0.9569403357322088, 0.2902846772544625}, // k=61 + {0.9807852804032303, 0.1950903220161287}, // k=62 + {0.9951847266721969, 0.0980171403295605}, // k=63 +}; + +__constant__ double2 c_twiddle_inv_64[64] = { + {1.0000000000000000, 0.0000000000000000}, // k=0 + {0.9951847266721969, 0.0980171403295606}, // k=1 + {0.9807852804032304, 0.1950903220161282}, // k=2 + {0.9569403357322088, 0.2902846772544623}, // k=3 + {0.9238795325112867, 0.3826834323650898}, // k=4 + {0.8819212643483550, 0.4713967368259976}, // k=5 + {0.8314696123025452, 0.5555702330196022}, // k=6 + {0.7730104533627370, 0.6343932841636455}, // k=7 + {0.7071067811865476, 0.7071067811865475}, // k=8 + {0.6343932841636455, 0.7730104533627370}, // k=9 + {0.5555702330196023, 0.8314696123025452}, // k=10 + {0.4713967368259978, 0.8819212643483549}, // k=11 + {0.3826834323650898, 0.9238795325112867}, // k=12 + {0.2902846772544623, 0.9569403357322089}, // k=13 + {0.1950903220161283, 0.9807852804032304}, // k=14 + {0.0980171403295608, 0.9951847266721968}, // k=15 + {0.0000000000000001, 1.0000000000000000}, // k=16 + {-0.0980171403295606, 0.9951847266721969}, // k=17 + {-0.1950903220161282, 0.9807852804032304}, // k=18 + {-0.2902846772544622, 0.9569403357322089}, // k=19 + {-0.3826834323650897, 0.9238795325112867}, // k=20 + {-0.4713967368259977, 0.8819212643483550}, // k=21 + {-0.5555702330196020, 0.8314696123025455}, // k=22 + {-0.6343932841636454, 0.7730104533627371}, // k=23 + {-0.7071067811865475, 0.7071067811865476}, // k=24 + {-0.7730104533627370, 0.6343932841636455}, // k=25 + {-0.8314696123025453, 
0.5555702330196022}, // k=26 + {-0.8819212643483549, 0.4713967368259979}, // k=27 + {-0.9238795325112867, 0.3826834323650899}, // k=28 + {-0.9569403357322088, 0.2902846772544624}, // k=29 + {-0.9807852804032304, 0.1950903220161286}, // k=30 + {-0.9951847266721968, 0.0980171403295608}, // k=31 + {-1.0000000000000000, 0.0000000000000001}, // k=32 + {-0.9951847266721969, -0.0980171403295606}, // k=33 + {-0.9807852804032304, -0.1950903220161284}, // k=34 + {-0.9569403357322089, -0.2902846772544621}, // k=35 + {-0.9238795325112868, -0.3826834323650897}, // k=36 + {-0.8819212643483550, -0.4713967368259976}, // k=37 + {-0.8314696123025455, -0.5555702330196020}, // k=38 + {-0.7730104533627371, -0.6343932841636453}, // k=39 + {-0.7071067811865477, -0.7071067811865475}, // k=40 + {-0.6343932841636459, -0.7730104533627367}, // k=41 + {-0.5555702330196022, -0.8314696123025452}, // k=42 + {-0.4713967368259979, -0.8819212643483549}, // k=43 + {-0.3826834323650903, -0.9238795325112865}, // k=44 + {-0.2902846772544624, -0.9569403357322088}, // k=45 + {-0.1950903220161287, -0.9807852804032303}, // k=46 + {-0.0980171403295605, -0.9951847266721969}, // k=47 + {-0.0000000000000002, -1.0000000000000000}, // k=48 + {0.0980171403295601, -0.9951847266721969}, // k=49 + {0.1950903220161283, -0.9807852804032304}, // k=50 + {0.2902846772544621, -0.9569403357322089}, // k=51 + {0.3826834323650900, -0.9238795325112866}, // k=52 + {0.4713967368259976, -0.8819212643483550}, // k=53 + {0.5555702330196018, -0.8314696123025455}, // k=54 + {0.6343932841636456, -0.7730104533627369}, // k=55 + {0.7071067811865474, -0.7071067811865477}, // k=56 + {0.7730104533627367, -0.6343932841636459}, // k=57 + {0.8314696123025452, -0.5555702330196022}, // k=58 + {0.8819212643483548, -0.4713967368259979}, // k=59 + {0.9238795325112865, -0.3826834323650904}, // k=60 + {0.9569403357322088, -0.2902846772544625}, // k=61 + {0.9807852804032303, -0.1950903220161287}, // k=62 + {0.9951847266721969, -0.0980171403295605}, 
// k=63 +}; +__constant__ double2 c_twiddle_fwd_128[128] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9987954562051724, -0.0490676743274180}, // k=1 + {0.9951847266721969, -0.0980171403295606}, // k=2 + {0.9891765099647810, -0.1467304744553617}, // k=3 + {0.9807852804032304, -0.1950903220161282}, // k=4 + {0.9700312531945440, -0.2429801799032639}, // k=5 + {0.9569403357322088, -0.2902846772544623}, // k=6 + {0.9415440651830208, -0.3368898533922201}, // k=7 + {0.9238795325112867, -0.3826834323650898}, // k=8 + {0.9039892931234433, -0.4275550934302821}, // k=9 + {0.8819212643483550, -0.4713967368259976}, // k=10 + {0.8577286100002721, -0.5141027441932217}, // k=11 + {0.8314696123025452, -0.5555702330196022}, // k=12 + {0.8032075314806449, -0.5956993044924334}, // k=13 + {0.7730104533627370, -0.6343932841636455}, // k=14 + {0.7409511253549591, -0.6715589548470183}, // k=15 + {0.7071067811865476, -0.7071067811865475}, // k=16 + {0.6715589548470183, -0.7409511253549591}, // k=17 + {0.6343932841636455, -0.7730104533627370}, // k=18 + {0.5956993044924335, -0.8032075314806448}, // k=19 + {0.5555702330196023, -0.8314696123025452}, // k=20 + {0.5141027441932217, -0.8577286100002721}, // k=21 + {0.4713967368259978, -0.8819212643483549}, // k=22 + {0.4275550934302822, -0.9039892931234433}, // k=23 + {0.3826834323650898, -0.9238795325112867}, // k=24 + {0.3368898533922201, -0.9415440651830208}, // k=25 + {0.2902846772544623, -0.9569403357322089}, // k=26 + {0.2429801799032640, -0.9700312531945440}, // k=27 + {0.1950903220161283, -0.9807852804032304}, // k=28 + {0.1467304744553617, -0.9891765099647810}, // k=29 + {0.0980171403295608, -0.9951847266721968}, // k=30 + {0.0490676743274181, -0.9987954562051724}, // k=31 + {0.0000000000000001, -1.0000000000000000}, // k=32 + {-0.0490676743274180, -0.9987954562051724}, // k=33 + {-0.0980171403295606, -0.9951847266721969}, // k=34 + {-0.1467304744553616, -0.9891765099647810}, // k=35 + {-0.1950903220161282, 
-0.9807852804032304}, // k=36 + {-0.2429801799032639, -0.9700312531945440}, // k=37 + {-0.2902846772544622, -0.9569403357322089}, // k=38 + {-0.3368898533922199, -0.9415440651830208}, // k=39 + {-0.3826834323650897, -0.9238795325112867}, // k=40 + {-0.4275550934302819, -0.9039892931234434}, // k=41 + {-0.4713967368259977, -0.8819212643483550}, // k=42 + {-0.5141027441932217, -0.8577286100002721}, // k=43 + {-0.5555702330196020, -0.8314696123025455}, // k=44 + {-0.5956993044924334, -0.8032075314806449}, // k=45 + {-0.6343932841636454, -0.7730104533627371}, // k=46 + {-0.6715589548470184, -0.7409511253549590}, // k=47 + {-0.7071067811865475, -0.7071067811865476}, // k=48 + {-0.7409511253549589, -0.6715589548470186}, // k=49 + {-0.7730104533627370, -0.6343932841636455}, // k=50 + {-0.8032075314806448, -0.5956993044924335}, // k=51 + {-0.8314696123025453, -0.5555702330196022}, // k=52 + {-0.8577286100002720, -0.5141027441932218}, // k=53 + {-0.8819212643483549, -0.4713967368259979}, // k=54 + {-0.9039892931234433, -0.4275550934302820}, // k=55 + {-0.9238795325112867, -0.3826834323650899}, // k=56 + {-0.9415440651830207, -0.3368898533922203}, // k=57 + {-0.9569403357322088, -0.2902846772544624}, // k=58 + {-0.9700312531945440, -0.2429801799032641}, // k=59 + {-0.9807852804032304, -0.1950903220161286}, // k=60 + {-0.9891765099647810, -0.1467304744553618}, // k=61 + {-0.9951847266721968, -0.0980171403295608}, // k=62 + {-0.9987954562051724, -0.0490676743274180}, // k=63 + {-1.0000000000000000, -0.0000000000000001}, // k=64 + {-0.9987954562051724, 0.0490676743274177}, // k=65 + {-0.9951847266721969, 0.0980171403295606}, // k=66 + {-0.9891765099647810, 0.1467304744553616}, // k=67 + {-0.9807852804032304, 0.1950903220161284}, // k=68 + {-0.9700312531945440, 0.2429801799032638}, // k=69 + {-0.9569403357322089, 0.2902846772544621}, // k=70 + {-0.9415440651830208, 0.3368898533922201}, // k=71 + {-0.9238795325112868, 0.3826834323650897}, // k=72 + {-0.9039892931234434, 
0.4275550934302818}, // k=73 + {-0.8819212643483550, 0.4713967368259976}, // k=74 + {-0.8577286100002721, 0.5141027441932216}, // k=75 + {-0.8314696123025455, 0.5555702330196020}, // k=76 + {-0.8032075314806449, 0.5956993044924332}, // k=77 + {-0.7730104533627371, 0.6343932841636453}, // k=78 + {-0.7409511253549591, 0.6715589548470184}, // k=79 + {-0.7071067811865477, 0.7071067811865475}, // k=80 + {-0.6715589548470187, 0.7409511253549589}, // k=81 + {-0.6343932841636459, 0.7730104533627367}, // k=82 + {-0.5956993044924331, 0.8032075314806451}, // k=83 + {-0.5555702330196022, 0.8314696123025452}, // k=84 + {-0.5141027441932218, 0.8577286100002720}, // k=85 + {-0.4713967368259979, 0.8819212643483549}, // k=86 + {-0.4275550934302825, 0.9039892931234431}, // k=87 + {-0.3826834323650903, 0.9238795325112865}, // k=88 + {-0.3368898533922199, 0.9415440651830208}, // k=89 + {-0.2902846772544624, 0.9569403357322088}, // k=90 + {-0.2429801799032641, 0.9700312531945440}, // k=91 + {-0.1950903220161287, 0.9807852804032303}, // k=92 + {-0.1467304744553623, 0.9891765099647809}, // k=93 + {-0.0980171403295605, 0.9951847266721969}, // k=94 + {-0.0490676743274180, 0.9987954562051724}, // k=95 + {-0.0000000000000002, 1.0000000000000000}, // k=96 + {0.0490676743274177, 0.9987954562051724}, // k=97 + {0.0980171403295601, 0.9951847266721969}, // k=98 + {0.1467304744553619, 0.9891765099647809}, // k=99 + {0.1950903220161283, 0.9807852804032304}, // k=100 + {0.2429801799032638, 0.9700312531945440}, // k=101 + {0.2902846772544621, 0.9569403357322089}, // k=102 + {0.3368898533922196, 0.9415440651830209}, // k=103 + {0.3826834323650900, 0.9238795325112866}, // k=104 + {0.4275550934302821, 0.9039892931234433}, // k=105 + {0.4713967368259976, 0.8819212643483550}, // k=106 + {0.5141027441932216, 0.8577286100002722}, // k=107 + {0.5555702330196018, 0.8314696123025455}, // k=108 + {0.5956993044924329, 0.8032075314806453}, // k=109 + {0.6343932841636456, 0.7730104533627369}, // k=110 + 
{0.6715589548470183, 0.7409511253549591}, // k=111 + {0.7071067811865474, 0.7071067811865477}, // k=112 + {0.7409511253549589, 0.6715589548470187}, // k=113 + {0.7730104533627367, 0.6343932841636459}, // k=114 + {0.8032075314806451, 0.5956993044924332}, // k=115 + {0.8314696123025452, 0.5555702330196022}, // k=116 + {0.8577286100002720, 0.5141027441932219}, // k=117 + {0.8819212643483548, 0.4713967368259979}, // k=118 + {0.9039892931234431, 0.4275550934302825}, // k=119 + {0.9238795325112865, 0.3826834323650904}, // k=120 + {0.9415440651830208, 0.3368898533922200}, // k=121 + {0.9569403357322088, 0.2902846772544625}, // k=122 + {0.9700312531945440, 0.2429801799032642}, // k=123 + {0.9807852804032303, 0.1950903220161287}, // k=124 + {0.9891765099647809, 0.1467304744553624}, // k=125 + {0.9951847266721969, 0.0980171403295605}, // k=126 + {0.9987954562051724, 0.0490676743274181}, // k=127 +}; + +__constant__ double2 c_twiddle_inv_128[128] = { + {1.0000000000000000, 0.0000000000000000}, // k=0 + {0.9987954562051724, 0.0490676743274180}, // k=1 + {0.9951847266721969, 0.0980171403295606}, // k=2 + {0.9891765099647810, 0.1467304744553617}, // k=3 + {0.9807852804032304, 0.1950903220161282}, // k=4 + {0.9700312531945440, 0.2429801799032639}, // k=5 + {0.9569403357322088, 0.2902846772544623}, // k=6 + {0.9415440651830208, 0.3368898533922201}, // k=7 + {0.9238795325112867, 0.3826834323650898}, // k=8 + {0.9039892931234433, 0.4275550934302821}, // k=9 + {0.8819212643483550, 0.4713967368259976}, // k=10 + {0.8577286100002721, 0.5141027441932217}, // k=11 + {0.8314696123025452, 0.5555702330196022}, // k=12 + {0.8032075314806449, 0.5956993044924334}, // k=13 + {0.7730104533627370, 0.6343932841636455}, // k=14 + {0.7409511253549591, 0.6715589548470183}, // k=15 + {0.7071067811865476, 0.7071067811865475}, // k=16 + {0.6715589548470183, 0.7409511253549591}, // k=17 + {0.6343932841636455, 0.7730104533627370}, // k=18 + {0.5956993044924335, 0.8032075314806448}, // k=19 + 
{0.5555702330196023, 0.8314696123025452}, // k=20 + {0.5141027441932217, 0.8577286100002721}, // k=21 + {0.4713967368259978, 0.8819212643483549}, // k=22 + {0.4275550934302822, 0.9039892931234433}, // k=23 + {0.3826834323650898, 0.9238795325112867}, // k=24 + {0.3368898533922201, 0.9415440651830208}, // k=25 + {0.2902846772544623, 0.9569403357322089}, // k=26 + {0.2429801799032640, 0.9700312531945440}, // k=27 + {0.1950903220161283, 0.9807852804032304}, // k=28 + {0.1467304744553617, 0.9891765099647810}, // k=29 + {0.0980171403295608, 0.9951847266721968}, // k=30 + {0.0490676743274181, 0.9987954562051724}, // k=31 + {0.0000000000000001, 1.0000000000000000}, // k=32 + {-0.0490676743274180, 0.9987954562051724}, // k=33 + {-0.0980171403295606, 0.9951847266721969}, // k=34 + {-0.1467304744553616, 0.9891765099647810}, // k=35 + {-0.1950903220161282, 0.9807852804032304}, // k=36 + {-0.2429801799032639, 0.9700312531945440}, // k=37 + {-0.2902846772544622, 0.9569403357322089}, // k=38 + {-0.3368898533922199, 0.9415440651830208}, // k=39 + {-0.3826834323650897, 0.9238795325112867}, // k=40 + {-0.4275550934302819, 0.9039892931234434}, // k=41 + {-0.4713967368259977, 0.8819212643483550}, // k=42 + {-0.5141027441932217, 0.8577286100002721}, // k=43 + {-0.5555702330196020, 0.8314696123025455}, // k=44 + {-0.5956993044924334, 0.8032075314806449}, // k=45 + {-0.6343932841636454, 0.7730104533627371}, // k=46 + {-0.6715589548470184, 0.7409511253549590}, // k=47 + {-0.7071067811865475, 0.7071067811865476}, // k=48 + {-0.7409511253549589, 0.6715589548470186}, // k=49 + {-0.7730104533627370, 0.6343932841636455}, // k=50 + {-0.8032075314806448, 0.5956993044924335}, // k=51 + {-0.8314696123025453, 0.5555702330196022}, // k=52 + {-0.8577286100002720, 0.5141027441932218}, // k=53 + {-0.8819212643483549, 0.4713967368259979}, // k=54 + {-0.9039892931234433, 0.4275550934302820}, // k=55 + {-0.9238795325112867, 0.3826834323650899}, // k=56 + {-0.9415440651830207, 0.3368898533922203}, // k=57 
+ {-0.9569403357322088, 0.2902846772544624}, // k=58 + {-0.9700312531945440, 0.2429801799032641}, // k=59 + {-0.9807852804032304, 0.1950903220161286}, // k=60 + {-0.9891765099647810, 0.1467304744553618}, // k=61 + {-0.9951847266721968, 0.0980171403295608}, // k=62 + {-0.9987954562051724, 0.0490676743274180}, // k=63 + {-1.0000000000000000, 0.0000000000000001}, // k=64 + {-0.9987954562051724, -0.0490676743274177}, // k=65 + {-0.9951847266721969, -0.0980171403295606}, // k=66 + {-0.9891765099647810, -0.1467304744553616}, // k=67 + {-0.9807852804032304, -0.1950903220161284}, // k=68 + {-0.9700312531945440, -0.2429801799032638}, // k=69 + {-0.9569403357322089, -0.2902846772544621}, // k=70 + {-0.9415440651830208, -0.3368898533922201}, // k=71 + {-0.9238795325112868, -0.3826834323650897}, // k=72 + {-0.9039892931234434, -0.4275550934302818}, // k=73 + {-0.8819212643483550, -0.4713967368259976}, // k=74 + {-0.8577286100002721, -0.5141027441932216}, // k=75 + {-0.8314696123025455, -0.5555702330196020}, // k=76 + {-0.8032075314806449, -0.5956993044924332}, // k=77 + {-0.7730104533627371, -0.6343932841636453}, // k=78 + {-0.7409511253549591, -0.6715589548470184}, // k=79 + {-0.7071067811865477, -0.7071067811865475}, // k=80 + {-0.6715589548470187, -0.7409511253549589}, // k=81 + {-0.6343932841636459, -0.7730104533627367}, // k=82 + {-0.5956993044924331, -0.8032075314806451}, // k=83 + {-0.5555702330196022, -0.8314696123025452}, // k=84 + {-0.5141027441932218, -0.8577286100002720}, // k=85 + {-0.4713967368259979, -0.8819212643483549}, // k=86 + {-0.4275550934302825, -0.9039892931234431}, // k=87 + {-0.3826834323650903, -0.9238795325112865}, // k=88 + {-0.3368898533922199, -0.9415440651830208}, // k=89 + {-0.2902846772544624, -0.9569403357322088}, // k=90 + {-0.2429801799032641, -0.9700312531945440}, // k=91 + {-0.1950903220161287, -0.9807852804032303}, // k=92 + {-0.1467304744553623, -0.9891765099647809}, // k=93 + {-0.0980171403295605, -0.9951847266721969}, // k=94 + 
{-0.0490676743274180, -0.9987954562051724}, // k=95 + {-0.0000000000000002, -1.0000000000000000}, // k=96 + {0.0490676743274177, -0.9987954562051724}, // k=97 + {0.0980171403295601, -0.9951847266721969}, // k=98 + {0.1467304744553619, -0.9891765099647809}, // k=99 + {0.1950903220161283, -0.9807852804032304}, // k=100 + {0.2429801799032638, -0.9700312531945440}, // k=101 + {0.2902846772544621, -0.9569403357322089}, // k=102 + {0.3368898533922196, -0.9415440651830209}, // k=103 + {0.3826834323650900, -0.9238795325112866}, // k=104 + {0.4275550934302821, -0.9039892931234433}, // k=105 + {0.4713967368259976, -0.8819212643483550}, // k=106 + {0.5141027441932216, -0.8577286100002722}, // k=107 + {0.5555702330196018, -0.8314696123025455}, // k=108 + {0.5956993044924329, -0.8032075314806453}, // k=109 + {0.6343932841636456, -0.7730104533627369}, // k=110 + {0.6715589548470183, -0.7409511253549591}, // k=111 + {0.7071067811865474, -0.7071067811865477}, // k=112 + {0.7409511253549589, -0.6715589548470187}, // k=113 + {0.7730104533627367, -0.6343932841636459}, // k=114 + {0.8032075314806451, -0.5956993044924332}, // k=115 + {0.8314696123025452, -0.5555702330196022}, // k=116 + {0.8577286100002720, -0.5141027441932219}, // k=117 + {0.8819212643483548, -0.4713967368259979}, // k=118 + {0.9039892931234431, -0.4275550934302825}, // k=119 + {0.9238795325112865, -0.3826834323650904}, // k=120 + {0.9415440651830208, -0.3368898533922200}, // k=121 + {0.9569403357322088, -0.2902846772544625}, // k=122 + {0.9700312531945440, -0.2429801799032642}, // k=123 + {0.9807852804032303, -0.1950903220161287}, // k=124 + {0.9891765099647809, -0.1467304744553624}, // k=125 + {0.9951847266721969, -0.0980171403295605}, // k=126 + {0.9987954562051724, -0.0490676743274181}, // k=127 +}; +__constant__ double2 c_twiddle_fwd_256[256] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9996988186962042, -0.0245412285229123}, // k=1 + {0.9987954562051724, -0.0490676743274180}, // k=2 + 
{0.9972904566786902, -0.0735645635996674}, // k=3 + {0.9951847266721969, -0.0980171403295606}, // k=4 + {0.9924795345987100, -0.1224106751992162}, // k=5 + {0.9891765099647810, -0.1467304744553617}, // k=6 + {0.9852776423889412, -0.1709618887603012}, // k=7 + {0.9807852804032304, -0.1950903220161282}, // k=8 + {0.9757021300385286, -0.2191012401568698}, // k=9 + {0.9700312531945440, -0.2429801799032639}, // k=10 + {0.9637760657954398, -0.2667127574748984}, // k=11 + {0.9569403357322088, -0.2902846772544623}, // k=12 + {0.9495281805930367, -0.3136817403988915}, // k=13 + {0.9415440651830208, -0.3368898533922201}, // k=14 + {0.9329927988347390, -0.3598950365349881}, // k=15 + {0.9238795325112867, -0.3826834323650898}, // k=16 + {0.9142097557035307, -0.4052413140049899}, // k=17 + {0.9039892931234433, -0.4275550934302821}, // k=18 + {0.8932243011955153, -0.4496113296546065}, // k=19 + {0.8819212643483550, -0.4713967368259976}, // k=20 + {0.8700869911087115, -0.4928981922297840}, // k=21 + {0.8577286100002721, -0.5141027441932217}, // k=22 + {0.8448535652497071, -0.5349976198870972}, // k=23 + {0.8314696123025452, -0.5555702330196022}, // k=24 + {0.8175848131515837, -0.5758081914178453}, // k=25 + {0.8032075314806449, -0.5956993044924334}, // k=26 + {0.7883464276266063, -0.6152315905806268}, // k=27 + {0.7730104533627370, -0.6343932841636455}, // k=28 + {0.7572088465064846, -0.6531728429537768}, // k=29 + {0.7409511253549591, -0.6715589548470183}, // k=30 + {0.7242470829514670, -0.6895405447370668}, // k=31 + {0.7071067811865476, -0.7071067811865475}, // k=32 + {0.6895405447370669, -0.7242470829514669}, // k=33 + {0.6715589548470183, -0.7409511253549591}, // k=34 + {0.6531728429537768, -0.7572088465064845}, // k=35 + {0.6343932841636455, -0.7730104533627370}, // k=36 + {0.6152315905806268, -0.7883464276266062}, // k=37 + {0.5956993044924335, -0.8032075314806448}, // k=38 + {0.5758081914178453, -0.8175848131515837}, // k=39 + {0.5555702330196023, -0.8314696123025452}, // 
k=40 + {0.5349976198870973, -0.8448535652497070}, // k=41 + {0.5141027441932217, -0.8577286100002721}, // k=42 + {0.4928981922297841, -0.8700869911087113}, // k=43 + {0.4713967368259978, -0.8819212643483549}, // k=44 + {0.4496113296546066, -0.8932243011955153}, // k=45 + {0.4275550934302822, -0.9039892931234433}, // k=46 + {0.4052413140049899, -0.9142097557035307}, // k=47 + {0.3826834323650898, -0.9238795325112867}, // k=48 + {0.3598950365349883, -0.9329927988347388}, // k=49 + {0.3368898533922201, -0.9415440651830208}, // k=50 + {0.3136817403988916, -0.9495281805930367}, // k=51 + {0.2902846772544623, -0.9569403357322089}, // k=52 + {0.2667127574748984, -0.9637760657954398}, // k=53 + {0.2429801799032640, -0.9700312531945440}, // k=54 + {0.2191012401568698, -0.9757021300385286}, // k=55 + {0.1950903220161283, -0.9807852804032304}, // k=56 + {0.1709618887603014, -0.9852776423889412}, // k=57 + {0.1467304744553617, -0.9891765099647810}, // k=58 + {0.1224106751992163, -0.9924795345987100}, // k=59 + {0.0980171403295608, -0.9951847266721968}, // k=60 + {0.0735645635996675, -0.9972904566786902}, // k=61 + {0.0490676743274181, -0.9987954562051724}, // k=62 + {0.0245412285229123, -0.9996988186962042}, // k=63 + {0.0000000000000001, -1.0000000000000000}, // k=64 + {-0.0245412285229121, -0.9996988186962042}, // k=65 + {-0.0490676743274180, -0.9987954562051724}, // k=66 + {-0.0735645635996673, -0.9972904566786902}, // k=67 + {-0.0980171403295606, -0.9951847266721969}, // k=68 + {-0.1224106751992162, -0.9924795345987100}, // k=69 + {-0.1467304744553616, -0.9891765099647810}, // k=70 + {-0.1709618887603012, -0.9852776423889412}, // k=71 + {-0.1950903220161282, -0.9807852804032304}, // k=72 + {-0.2191012401568697, -0.9757021300385286}, // k=73 + {-0.2429801799032639, -0.9700312531945440}, // k=74 + {-0.2667127574748983, -0.9637760657954398}, // k=75 + {-0.2902846772544622, -0.9569403357322089}, // k=76 + {-0.3136817403988914, -0.9495281805930367}, // k=77 + 
{-0.3368898533922199, -0.9415440651830208}, // k=78 + {-0.3598950365349882, -0.9329927988347388}, // k=79 + {-0.3826834323650897, -0.9238795325112867}, // k=80 + {-0.4052413140049897, -0.9142097557035307}, // k=81 + {-0.4275550934302819, -0.9039892931234434}, // k=82 + {-0.4496113296546067, -0.8932243011955152}, // k=83 + {-0.4713967368259977, -0.8819212643483550}, // k=84 + {-0.4928981922297840, -0.8700869911087115}, // k=85 + {-0.5141027441932217, -0.8577286100002721}, // k=86 + {-0.5349976198870970, -0.8448535652497072}, // k=87 + {-0.5555702330196020, -0.8314696123025455}, // k=88 + {-0.5758081914178453, -0.8175848131515837}, // k=89 + {-0.5956993044924334, -0.8032075314806449}, // k=90 + {-0.6152315905806267, -0.7883464276266063}, // k=91 + {-0.6343932841636454, -0.7730104533627371}, // k=92 + {-0.6531728429537765, -0.7572088465064847}, // k=93 + {-0.6715589548470184, -0.7409511253549590}, // k=94 + {-0.6895405447370669, -0.7242470829514669}, // k=95 + {-0.7071067811865475, -0.7071067811865476}, // k=96 + {-0.7242470829514668, -0.6895405447370671}, // k=97 + {-0.7409511253549589, -0.6715589548470186}, // k=98 + {-0.7572088465064846, -0.6531728429537766}, // k=99 + {-0.7730104533627370, -0.6343932841636455}, // k=100 + {-0.7883464276266062, -0.6152315905806269}, // k=101 + {-0.8032075314806448, -0.5956993044924335}, // k=102 + {-0.8175848131515836, -0.5758081914178454}, // k=103 + {-0.8314696123025453, -0.5555702330196022}, // k=104 + {-0.8448535652497071, -0.5349976198870972}, // k=105 + {-0.8577286100002720, -0.5141027441932218}, // k=106 + {-0.8700869911087113, -0.4928981922297841}, // k=107 + {-0.8819212643483549, -0.4713967368259979}, // k=108 + {-0.8932243011955152, -0.4496113296546069}, // k=109 + {-0.9039892931234433, -0.4275550934302820}, // k=110 + {-0.9142097557035307, -0.4052413140049899}, // k=111 + {-0.9238795325112867, -0.3826834323650899}, // k=112 + {-0.9329927988347388, -0.3598950365349883}, // k=113 + {-0.9415440651830207, 
-0.3368898533922203}, // k=114 + {-0.9495281805930367, -0.3136817403988914}, // k=115 + {-0.9569403357322088, -0.2902846772544624}, // k=116 + {-0.9637760657954398, -0.2667127574748985}, // k=117 + {-0.9700312531945440, -0.2429801799032641}, // k=118 + {-0.9757021300385285, -0.2191012401568700}, // k=119 + {-0.9807852804032304, -0.1950903220161286}, // k=120 + {-0.9852776423889412, -0.1709618887603012}, // k=121 + {-0.9891765099647810, -0.1467304744553618}, // k=122 + {-0.9924795345987100, -0.1224106751992163}, // k=123 + {-0.9951847266721968, -0.0980171403295608}, // k=124 + {-0.9972904566786902, -0.0735645635996677}, // k=125 + {-0.9987954562051724, -0.0490676743274180}, // k=126 + {-0.9996988186962042, -0.0245412285229123}, // k=127 + {-1.0000000000000000, -0.0000000000000001}, // k=128 + {-0.9996988186962042, 0.0245412285229121}, // k=129 + {-0.9987954562051724, 0.0490676743274177}, // k=130 + {-0.9972904566786902, 0.0735645635996675}, // k=131 + {-0.9951847266721969, 0.0980171403295606}, // k=132 + {-0.9924795345987100, 0.1224106751992161}, // k=133 + {-0.9891765099647810, 0.1467304744553616}, // k=134 + {-0.9852776423889413, 0.1709618887603010}, // k=135 + {-0.9807852804032304, 0.1950903220161284}, // k=136 + {-0.9757021300385286, 0.2191012401568698}, // k=137 + {-0.9700312531945440, 0.2429801799032638}, // k=138 + {-0.9637760657954400, 0.2667127574748983}, // k=139 + {-0.9569403357322089, 0.2902846772544621}, // k=140 + {-0.9495281805930368, 0.3136817403988912}, // k=141 + {-0.9415440651830208, 0.3368898533922201}, // k=142 + {-0.9329927988347390, 0.3598950365349881}, // k=143 + {-0.9238795325112868, 0.3826834323650897}, // k=144 + {-0.9142097557035307, 0.4052413140049897}, // k=145 + {-0.9039892931234434, 0.4275550934302818}, // k=146 + {-0.8932243011955153, 0.4496113296546067}, // k=147 + {-0.8819212643483550, 0.4713967368259976}, // k=148 + {-0.8700869911087115, 0.4928981922297839}, // k=149 + {-0.8577286100002721, 0.5141027441932216}, // k=150 + 
{-0.8448535652497072, 0.5349976198870969}, // k=151 + {-0.8314696123025455, 0.5555702330196020}, // k=152 + {-0.8175848131515837, 0.5758081914178453}, // k=153 + {-0.8032075314806449, 0.5956993044924332}, // k=154 + {-0.7883464276266063, 0.6152315905806267}, // k=155 + {-0.7730104533627371, 0.6343932841636453}, // k=156 + {-0.7572088465064848, 0.6531728429537765}, // k=157 + {-0.7409511253549591, 0.6715589548470184}, // k=158 + {-0.7242470829514670, 0.6895405447370668}, // k=159 + {-0.7071067811865477, 0.7071067811865475}, // k=160 + {-0.6895405447370671, 0.7242470829514668}, // k=161 + {-0.6715589548470187, 0.7409511253549589}, // k=162 + {-0.6531728429537771, 0.7572088465064842}, // k=163 + {-0.6343932841636459, 0.7730104533627367}, // k=164 + {-0.6152315905806273, 0.7883464276266059}, // k=165 + {-0.5956993044924331, 0.8032075314806451}, // k=166 + {-0.5758081914178452, 0.8175848131515838}, // k=167 + {-0.5555702330196022, 0.8314696123025452}, // k=168 + {-0.5349976198870973, 0.8448535652497070}, // k=169 + {-0.5141027441932218, 0.8577286100002720}, // k=170 + {-0.4928981922297842, 0.8700869911087113}, // k=171 + {-0.4713967368259979, 0.8819212643483549}, // k=172 + {-0.4496113296546069, 0.8932243011955152}, // k=173 + {-0.4275550934302825, 0.9039892931234431}, // k=174 + {-0.4052413140049904, 0.9142097557035305}, // k=175 + {-0.3826834323650903, 0.9238795325112865}, // k=176 + {-0.3598950365349879, 0.9329927988347390}, // k=177 + {-0.3368898533922199, 0.9415440651830208}, // k=178 + {-0.3136817403988915, 0.9495281805930367}, // k=179 + {-0.2902846772544624, 0.9569403357322088}, // k=180 + {-0.2667127574748985, 0.9637760657954398}, // k=181 + {-0.2429801799032641, 0.9700312531945440}, // k=182 + {-0.2191012401568701, 0.9757021300385285}, // k=183 + {-0.1950903220161287, 0.9807852804032303}, // k=184 + {-0.1709618887603017, 0.9852776423889411}, // k=185 + {-0.1467304744553623, 0.9891765099647809}, // k=186 + {-0.1224106751992160, 0.9924795345987101}, // k=187 + 
{-0.0980171403295605, 0.9951847266721969}, // k=188 + {-0.0735645635996674, 0.9972904566786902}, // k=189 + {-0.0490676743274180, 0.9987954562051724}, // k=190 + {-0.0245412285229124, 0.9996988186962042}, // k=191 + {-0.0000000000000002, 1.0000000000000000}, // k=192 + {0.0245412285229120, 0.9996988186962042}, // k=193 + {0.0490676743274177, 0.9987954562051724}, // k=194 + {0.0735645635996670, 0.9972904566786902}, // k=195 + {0.0980171403295601, 0.9951847266721969}, // k=196 + {0.1224106751992156, 0.9924795345987101}, // k=197 + {0.1467304744553619, 0.9891765099647809}, // k=198 + {0.1709618887603013, 0.9852776423889412}, // k=199 + {0.1950903220161283, 0.9807852804032304}, // k=200 + {0.2191012401568697, 0.9757021300385286}, // k=201 + {0.2429801799032638, 0.9700312531945440}, // k=202 + {0.2667127574748982, 0.9637760657954400}, // k=203 + {0.2902846772544621, 0.9569403357322089}, // k=204 + {0.3136817403988911, 0.9495281805930368}, // k=205 + {0.3368898533922196, 0.9415440651830209}, // k=206 + {0.3598950365349876, 0.9329927988347391}, // k=207 + {0.3826834323650900, 0.9238795325112866}, // k=208 + {0.4052413140049900, 0.9142097557035306}, // k=209 + {0.4275550934302821, 0.9039892931234433}, // k=210 + {0.4496113296546066, 0.8932243011955153}, // k=211 + {0.4713967368259976, 0.8819212643483550}, // k=212 + {0.4928981922297839, 0.8700869911087115}, // k=213 + {0.5141027441932216, 0.8577286100002722}, // k=214 + {0.5349976198870969, 0.8448535652497072}, // k=215 + {0.5555702330196018, 0.8314696123025455}, // k=216 + {0.5758081914178449, 0.8175848131515840}, // k=217 + {0.5956993044924329, 0.8032075314806453}, // k=218 + {0.6152315905806270, 0.7883464276266061}, // k=219 + {0.6343932841636456, 0.7730104533627369}, // k=220 + {0.6531728429537768, 0.7572088465064846}, // k=221 + {0.6715589548470183, 0.7409511253549591}, // k=222 + {0.6895405447370668, 0.7242470829514670}, // k=223 + {0.7071067811865474, 0.7071067811865477}, // k=224 + {0.7242470829514667, 
0.6895405447370672}, // k=225 + {0.7409511253549589, 0.6715589548470187}, // k=226 + {0.7572088465064842, 0.6531728429537771}, // k=227 + {0.7730104533627367, 0.6343932841636459}, // k=228 + {0.7883464276266059, 0.6152315905806274}, // k=229 + {0.8032075314806451, 0.5956993044924332}, // k=230 + {0.8175848131515837, 0.5758081914178452}, // k=231 + {0.8314696123025452, 0.5555702330196022}, // k=232 + {0.8448535652497070, 0.5349976198870973}, // k=233 + {0.8577286100002720, 0.5141027441932219}, // k=234 + {0.8700869911087113, 0.4928981922297843}, // k=235 + {0.8819212643483548, 0.4713967368259979}, // k=236 + {0.8932243011955151, 0.4496113296546070}, // k=237 + {0.9039892931234431, 0.4275550934302825}, // k=238 + {0.9142097557035305, 0.4052413140049904}, // k=239 + {0.9238795325112865, 0.3826834323650904}, // k=240 + {0.9329927988347390, 0.3598950365349880}, // k=241 + {0.9415440651830208, 0.3368898533922200}, // k=242 + {0.9495281805930367, 0.3136817403988915}, // k=243 + {0.9569403357322088, 0.2902846772544625}, // k=244 + {0.9637760657954398, 0.2667127574748986}, // k=245 + {0.9700312531945440, 0.2429801799032642}, // k=246 + {0.9757021300385285, 0.2191012401568702}, // k=247 + {0.9807852804032303, 0.1950903220161287}, // k=248 + {0.9852776423889411, 0.1709618887603018}, // k=249 + {0.9891765099647809, 0.1467304744553624}, // k=250 + {0.9924795345987100, 0.1224106751992160}, // k=251 + {0.9951847266721969, 0.0980171403295605}, // k=252 + {0.9972904566786902, 0.0735645635996674}, // k=253 + {0.9987954562051724, 0.0490676743274181}, // k=254 + {0.9996988186962042, 0.0245412285229124}, // k=255 +}; + +__constant__ double2 c_twiddle_inv_256[256] = { + {1.0000000000000000, 0.0000000000000000}, // k=0 + {0.9996988186962042, 0.0245412285229123}, // k=1 + {0.9987954562051724, 0.0490676743274180}, // k=2 + {0.9972904566786902, 0.0735645635996674}, // k=3 + {0.9951847266721969, 0.0980171403295606}, // k=4 + {0.9924795345987100, 0.1224106751992162}, // k=5 + 
{0.9891765099647810, 0.1467304744553617}, // k=6 + {0.9852776423889412, 0.1709618887603012}, // k=7 + {0.9807852804032304, 0.1950903220161282}, // k=8 + {0.9757021300385286, 0.2191012401568698}, // k=9 + {0.9700312531945440, 0.2429801799032639}, // k=10 + {0.9637760657954398, 0.2667127574748984}, // k=11 + {0.9569403357322088, 0.2902846772544623}, // k=12 + {0.9495281805930367, 0.3136817403988915}, // k=13 + {0.9415440651830208, 0.3368898533922201}, // k=14 + {0.9329927988347390, 0.3598950365349881}, // k=15 + {0.9238795325112867, 0.3826834323650898}, // k=16 + {0.9142097557035307, 0.4052413140049899}, // k=17 + {0.9039892931234433, 0.4275550934302821}, // k=18 + {0.8932243011955153, 0.4496113296546065}, // k=19 + {0.8819212643483550, 0.4713967368259976}, // k=20 + {0.8700869911087115, 0.4928981922297840}, // k=21 + {0.8577286100002721, 0.5141027441932217}, // k=22 + {0.8448535652497071, 0.5349976198870972}, // k=23 + {0.8314696123025452, 0.5555702330196022}, // k=24 + {0.8175848131515837, 0.5758081914178453}, // k=25 + {0.8032075314806449, 0.5956993044924334}, // k=26 + {0.7883464276266063, 0.6152315905806268}, // k=27 + {0.7730104533627370, 0.6343932841636455}, // k=28 + {0.7572088465064846, 0.6531728429537768}, // k=29 + {0.7409511253549591, 0.6715589548470183}, // k=30 + {0.7242470829514670, 0.6895405447370668}, // k=31 + {0.7071067811865476, 0.7071067811865475}, // k=32 + {0.6895405447370669, 0.7242470829514669}, // k=33 + {0.6715589548470183, 0.7409511253549591}, // k=34 + {0.6531728429537768, 0.7572088465064845}, // k=35 + {0.6343932841636455, 0.7730104533627370}, // k=36 + {0.6152315905806268, 0.7883464276266062}, // k=37 + {0.5956993044924335, 0.8032075314806448}, // k=38 + {0.5758081914178453, 0.8175848131515837}, // k=39 + {0.5555702330196023, 0.8314696123025452}, // k=40 + {0.5349976198870973, 0.8448535652497070}, // k=41 + {0.5141027441932217, 0.8577286100002721}, // k=42 + {0.4928981922297841, 0.8700869911087113}, // k=43 + {0.4713967368259978, 
0.8819212643483549}, // k=44 + {0.4496113296546066, 0.8932243011955153}, // k=45 + {0.4275550934302822, 0.9039892931234433}, // k=46 + {0.4052413140049899, 0.9142097557035307}, // k=47 + {0.3826834323650898, 0.9238795325112867}, // k=48 + {0.3598950365349883, 0.9329927988347388}, // k=49 + {0.3368898533922201, 0.9415440651830208}, // k=50 + {0.3136817403988916, 0.9495281805930367}, // k=51 + {0.2902846772544623, 0.9569403357322089}, // k=52 + {0.2667127574748984, 0.9637760657954398}, // k=53 + {0.2429801799032640, 0.9700312531945440}, // k=54 + {0.2191012401568698, 0.9757021300385286}, // k=55 + {0.1950903220161283, 0.9807852804032304}, // k=56 + {0.1709618887603014, 0.9852776423889412}, // k=57 + {0.1467304744553617, 0.9891765099647810}, // k=58 + {0.1224106751992163, 0.9924795345987100}, // k=59 + {0.0980171403295608, 0.9951847266721968}, // k=60 + {0.0735645635996675, 0.9972904566786902}, // k=61 + {0.0490676743274181, 0.9987954562051724}, // k=62 + {0.0245412285229123, 0.9996988186962042}, // k=63 + {0.0000000000000001, 1.0000000000000000}, // k=64 + {-0.0245412285229121, 0.9996988186962042}, // k=65 + {-0.0490676743274180, 0.9987954562051724}, // k=66 + {-0.0735645635996673, 0.9972904566786902}, // k=67 + {-0.0980171403295606, 0.9951847266721969}, // k=68 + {-0.1224106751992162, 0.9924795345987100}, // k=69 + {-0.1467304744553616, 0.9891765099647810}, // k=70 + {-0.1709618887603012, 0.9852776423889412}, // k=71 + {-0.1950903220161282, 0.9807852804032304}, // k=72 + {-0.2191012401568697, 0.9757021300385286}, // k=73 + {-0.2429801799032639, 0.9700312531945440}, // k=74 + {-0.2667127574748983, 0.9637760657954398}, // k=75 + {-0.2902846772544622, 0.9569403357322089}, // k=76 + {-0.3136817403988914, 0.9495281805930367}, // k=77 + {-0.3368898533922199, 0.9415440651830208}, // k=78 + {-0.3598950365349882, 0.9329927988347388}, // k=79 + {-0.3826834323650897, 0.9238795325112867}, // k=80 + {-0.4052413140049897, 0.9142097557035307}, // k=81 + {-0.4275550934302819, 
0.9039892931234434}, // k=82 + {-0.4496113296546067, 0.8932243011955152}, // k=83 + {-0.4713967368259977, 0.8819212643483550}, // k=84 + {-0.4928981922297840, 0.8700869911087115}, // k=85 + {-0.5141027441932217, 0.8577286100002721}, // k=86 + {-0.5349976198870970, 0.8448535652497072}, // k=87 + {-0.5555702330196020, 0.8314696123025455}, // k=88 + {-0.5758081914178453, 0.8175848131515837}, // k=89 + {-0.5956993044924334, 0.8032075314806449}, // k=90 + {-0.6152315905806267, 0.7883464276266063}, // k=91 + {-0.6343932841636454, 0.7730104533627371}, // k=92 + {-0.6531728429537765, 0.7572088465064847}, // k=93 + {-0.6715589548470184, 0.7409511253549590}, // k=94 + {-0.6895405447370669, 0.7242470829514669}, // k=95 + {-0.7071067811865475, 0.7071067811865476}, // k=96 + {-0.7242470829514668, 0.6895405447370671}, // k=97 + {-0.7409511253549589, 0.6715589548470186}, // k=98 + {-0.7572088465064846, 0.6531728429537766}, // k=99 + {-0.7730104533627370, 0.6343932841636455}, // k=100 + {-0.7883464276266062, 0.6152315905806269}, // k=101 + {-0.8032075314806448, 0.5956993044924335}, // k=102 + {-0.8175848131515836, 0.5758081914178454}, // k=103 + {-0.8314696123025453, 0.5555702330196022}, // k=104 + {-0.8448535652497071, 0.5349976198870972}, // k=105 + {-0.8577286100002720, 0.5141027441932218}, // k=106 + {-0.8700869911087113, 0.4928981922297841}, // k=107 + {-0.8819212643483549, 0.4713967368259979}, // k=108 + {-0.8932243011955152, 0.4496113296546069}, // k=109 + {-0.9039892931234433, 0.4275550934302820}, // k=110 + {-0.9142097557035307, 0.4052413140049899}, // k=111 + {-0.9238795325112867, 0.3826834323650899}, // k=112 + {-0.9329927988347388, 0.3598950365349883}, // k=113 + {-0.9415440651830207, 0.3368898533922203}, // k=114 + {-0.9495281805930367, 0.3136817403988914}, // k=115 + {-0.9569403357322088, 0.2902846772544624}, // k=116 + {-0.9637760657954398, 0.2667127574748985}, // k=117 + {-0.9700312531945440, 0.2429801799032641}, // k=118 + {-0.9757021300385285, 
0.2191012401568700}, // k=119 + {-0.9807852804032304, 0.1950903220161286}, // k=120 + {-0.9852776423889412, 0.1709618887603012}, // k=121 + {-0.9891765099647810, 0.1467304744553618}, // k=122 + {-0.9924795345987100, 0.1224106751992163}, // k=123 + {-0.9951847266721968, 0.0980171403295608}, // k=124 + {-0.9972904566786902, 0.0735645635996677}, // k=125 + {-0.9987954562051724, 0.0490676743274180}, // k=126 + {-0.9996988186962042, 0.0245412285229123}, // k=127 + {-1.0000000000000000, 0.0000000000000001}, // k=128 + {-0.9996988186962042, -0.0245412285229121}, // k=129 + {-0.9987954562051724, -0.0490676743274177}, // k=130 + {-0.9972904566786902, -0.0735645635996675}, // k=131 + {-0.9951847266721969, -0.0980171403295606}, // k=132 + {-0.9924795345987100, -0.1224106751992161}, // k=133 + {-0.9891765099647810, -0.1467304744553616}, // k=134 + {-0.9852776423889413, -0.1709618887603010}, // k=135 + {-0.9807852804032304, -0.1950903220161284}, // k=136 + {-0.9757021300385286, -0.2191012401568698}, // k=137 + {-0.9700312531945440, -0.2429801799032638}, // k=138 + {-0.9637760657954400, -0.2667127574748983}, // k=139 + {-0.9569403357322089, -0.2902846772544621}, // k=140 + {-0.9495281805930368, -0.3136817403988912}, // k=141 + {-0.9415440651830208, -0.3368898533922201}, // k=142 + {-0.9329927988347390, -0.3598950365349881}, // k=143 + {-0.9238795325112868, -0.3826834323650897}, // k=144 + {-0.9142097557035307, -0.4052413140049897}, // k=145 + {-0.9039892931234434, -0.4275550934302818}, // k=146 + {-0.8932243011955153, -0.4496113296546067}, // k=147 + {-0.8819212643483550, -0.4713967368259976}, // k=148 + {-0.8700869911087115, -0.4928981922297839}, // k=149 + {-0.8577286100002721, -0.5141027441932216}, // k=150 + {-0.8448535652497072, -0.5349976198870969}, // k=151 + {-0.8314696123025455, -0.5555702330196020}, // k=152 + {-0.8175848131515837, -0.5758081914178453}, // k=153 + {-0.8032075314806449, -0.5956993044924332}, // k=154 + {-0.7883464276266063, -0.6152315905806267}, // 
k=155 + {-0.7730104533627371, -0.6343932841636453}, // k=156 + {-0.7572088465064848, -0.6531728429537765}, // k=157 + {-0.7409511253549591, -0.6715589548470184}, // k=158 + {-0.7242470829514670, -0.6895405447370668}, // k=159 + {-0.7071067811865477, -0.7071067811865475}, // k=160 + {-0.6895405447370671, -0.7242470829514668}, // k=161 + {-0.6715589548470187, -0.7409511253549589}, // k=162 + {-0.6531728429537771, -0.7572088465064842}, // k=163 + {-0.6343932841636459, -0.7730104533627367}, // k=164 + {-0.6152315905806273, -0.7883464276266059}, // k=165 + {-0.5956993044924331, -0.8032075314806451}, // k=166 + {-0.5758081914178452, -0.8175848131515838}, // k=167 + {-0.5555702330196022, -0.8314696123025452}, // k=168 + {-0.5349976198870973, -0.8448535652497070}, // k=169 + {-0.5141027441932218, -0.8577286100002720}, // k=170 + {-0.4928981922297842, -0.8700869911087113}, // k=171 + {-0.4713967368259979, -0.8819212643483549}, // k=172 + {-0.4496113296546069, -0.8932243011955152}, // k=173 + {-0.4275550934302825, -0.9039892931234431}, // k=174 + {-0.4052413140049904, -0.9142097557035305}, // k=175 + {-0.3826834323650903, -0.9238795325112865}, // k=176 + {-0.3598950365349879, -0.9329927988347390}, // k=177 + {-0.3368898533922199, -0.9415440651830208}, // k=178 + {-0.3136817403988915, -0.9495281805930367}, // k=179 + {-0.2902846772544624, -0.9569403357322088}, // k=180 + {-0.2667127574748985, -0.9637760657954398}, // k=181 + {-0.2429801799032641, -0.9700312531945440}, // k=182 + {-0.2191012401568701, -0.9757021300385285}, // k=183 + {-0.1950903220161287, -0.9807852804032303}, // k=184 + {-0.1709618887603017, -0.9852776423889411}, // k=185 + {-0.1467304744553623, -0.9891765099647809}, // k=186 + {-0.1224106751992160, -0.9924795345987101}, // k=187 + {-0.0980171403295605, -0.9951847266721969}, // k=188 + {-0.0735645635996674, -0.9972904566786902}, // k=189 + {-0.0490676743274180, -0.9987954562051724}, // k=190 + {-0.0245412285229124, -0.9996988186962042}, // k=191 + 
{-0.0000000000000002, -1.0000000000000000}, // k=192 + {0.0245412285229120, -0.9996988186962042}, // k=193 + {0.0490676743274177, -0.9987954562051724}, // k=194 + {0.0735645635996670, -0.9972904566786902}, // k=195 + {0.0980171403295601, -0.9951847266721969}, // k=196 + {0.1224106751992156, -0.9924795345987101}, // k=197 + {0.1467304744553619, -0.9891765099647809}, // k=198 + {0.1709618887603013, -0.9852776423889412}, // k=199 + {0.1950903220161283, -0.9807852804032304}, // k=200 + {0.2191012401568697, -0.9757021300385286}, // k=201 + {0.2429801799032638, -0.9700312531945440}, // k=202 + {0.2667127574748982, -0.9637760657954400}, // k=203 + {0.2902846772544621, -0.9569403357322089}, // k=204 + {0.3136817403988911, -0.9495281805930368}, // k=205 + {0.3368898533922196, -0.9415440651830209}, // k=206 + {0.3598950365349876, -0.9329927988347391}, // k=207 + {0.3826834323650900, -0.9238795325112866}, // k=208 + {0.4052413140049900, -0.9142097557035306}, // k=209 + {0.4275550934302821, -0.9039892931234433}, // k=210 + {0.4496113296546066, -0.8932243011955153}, // k=211 + {0.4713967368259976, -0.8819212643483550}, // k=212 + {0.4928981922297839, -0.8700869911087115}, // k=213 + {0.5141027441932216, -0.8577286100002722}, // k=214 + {0.5349976198870969, -0.8448535652497072}, // k=215 + {0.5555702330196018, -0.8314696123025455}, // k=216 + {0.5758081914178449, -0.8175848131515840}, // k=217 + {0.5956993044924329, -0.8032075314806453}, // k=218 + {0.6152315905806270, -0.7883464276266061}, // k=219 + {0.6343932841636456, -0.7730104533627369}, // k=220 + {0.6531728429537768, -0.7572088465064846}, // k=221 + {0.6715589548470183, -0.7409511253549591}, // k=222 + {0.6895405447370668, -0.7242470829514670}, // k=223 + {0.7071067811865474, -0.7071067811865477}, // k=224 + {0.7242470829514667, -0.6895405447370672}, // k=225 + {0.7409511253549589, -0.6715589548470187}, // k=226 + {0.7572088465064842, -0.6531728429537771}, // k=227 + {0.7730104533627367, -0.6343932841636459}, // k=228 + 
{0.7883464276266059, -0.6152315905806274}, // k=229 + {0.8032075314806451, -0.5956993044924332}, // k=230 + {0.8175848131515837, -0.5758081914178452}, // k=231 + {0.8314696123025452, -0.5555702330196022}, // k=232 + {0.8448535652497070, -0.5349976198870973}, // k=233 + {0.8577286100002720, -0.5141027441932219}, // k=234 + {0.8700869911087113, -0.4928981922297843}, // k=235 + {0.8819212643483548, -0.4713967368259979}, // k=236 + {0.8932243011955151, -0.4496113296546070}, // k=237 + {0.9039892931234431, -0.4275550934302825}, // k=238 + {0.9142097557035305, -0.4052413140049904}, // k=239 + {0.9238795325112865, -0.3826834323650904}, // k=240 + {0.9329927988347390, -0.3598950365349880}, // k=241 + {0.9415440651830208, -0.3368898533922200}, // k=242 + {0.9495281805930367, -0.3136817403988915}, // k=243 + {0.9569403357322088, -0.2902846772544625}, // k=244 + {0.9637760657954398, -0.2667127574748986}, // k=245 + {0.9700312531945440, -0.2429801799032642}, // k=246 + {0.9757021300385285, -0.2191012401568702}, // k=247 + {0.9807852804032303, -0.1950903220161287}, // k=248 + {0.9852776423889411, -0.1709618887603018}, // k=249 + {0.9891765099647809, -0.1467304744553624}, // k=250 + {0.9924795345987100, -0.1224106751992160}, // k=251 + {0.9951847266721969, -0.0980171403295605}, // k=252 + {0.9972904566786902, -0.0735645635996674}, // k=253 + {0.9987954562051724, -0.0490676743274181}, // k=254 + {0.9996988186962042, -0.0245412285229124}, // k=255 +}; +__constant__ double2 c_twiddle_fwd_512[512] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9999247018391445, -0.0122715382857199}, // k=1 + {0.9996988186962042, -0.0245412285229123}, // k=2 + {0.9993223845883495, -0.0368072229413588}, // k=3 + {0.9987954562051724, -0.0490676743274180}, // k=4 + {0.9981181129001492, -0.0613207363022086}, // k=5 + {0.9972904566786902, -0.0735645635996674}, // k=6 + {0.9963126121827780, -0.0857973123444399}, // k=7 + {0.9951847266721969, -0.0980171403295606}, // k=8 + {0.9939069700023561, 
-0.1102222072938831}, // k=9 + {0.9924795345987100, -0.1224106751992162}, // k=10 + {0.9909026354277800, -0.1345807085071262}, // k=11 + {0.9891765099647810, -0.1467304744553617}, // k=12 + {0.9873014181578584, -0.1588581433338614}, // k=13 + {0.9852776423889412, -0.1709618887603012}, // k=14 + {0.9831054874312163, -0.1830398879551410}, // k=15 + {0.9807852804032304, -0.1950903220161282}, // k=16 + {0.9783173707196277, -0.2071113761922186}, // k=17 + {0.9757021300385286, -0.2191012401568698}, // k=18 + {0.9729399522055602, -0.2310581082806711}, // k=19 + {0.9700312531945440, -0.2429801799032639}, // k=20 + {0.9669764710448521, -0.2548656596045146}, // k=21 + {0.9637760657954398, -0.2667127574748984}, // k=22 + {0.9604305194155658, -0.2785196893850531}, // k=23 + {0.9569403357322088, -0.2902846772544623}, // k=24 + {0.9533060403541939, -0.3020059493192281}, // k=25 + {0.9495281805930367, -0.3136817403988915}, // k=26 + {0.9456073253805213, -0.3253102921622629}, // k=27 + {0.9415440651830208, -0.3368898533922201}, // k=28 + {0.9373390119125750, -0.3484186802494346}, // k=29 + {0.9329927988347390, -0.3598950365349881}, // k=30 + {0.9285060804732156, -0.3713171939518375}, // k=31 + {0.9238795325112867, -0.3826834323650898}, // k=32 + {0.9191138516900578, -0.3939920400610481}, // k=33 + {0.9142097557035307, -0.4052413140049899}, // k=34 + {0.9091679830905224, -0.4164295600976372}, // k=35 + {0.9039892931234433, -0.4275550934302821}, // k=36 + {0.8986744656939538, -0.4386162385385277}, // k=37 + {0.8932243011955153, -0.4496113296546065}, // k=38 + {0.8876396204028539, -0.4605387109582400}, // k=39 + {0.8819212643483550, -0.4713967368259976}, // k=40 + {0.8760700941954066, -0.4821837720791227}, // k=41 + {0.8700869911087115, -0.4928981922297840}, // k=42 + {0.8639728561215868, -0.5035383837257176}, // k=43 + {0.8577286100002721, -0.5141027441932217}, // k=44 + {0.8513551931052652, -0.5245896826784689}, // k=45 + {0.8448535652497071, -0.5349976198870972}, // k=46 + 
{0.8382247055548381, -0.5453249884220465}, // k=47 + {0.8314696123025452, -0.5555702330196022}, // k=48 + {0.8245893027850253, -0.5657318107836131}, // k=49 + {0.8175848131515837, -0.5758081914178453}, // k=50 + {0.8104571982525948, -0.5857978574564389}, // k=51 + {0.8032075314806449, -0.5956993044924334}, // k=52 + {0.7958369046088836, -0.6055110414043255}, // k=53 + {0.7883464276266063, -0.6152315905806268}, // k=54 + {0.7807372285720945, -0.6248594881423863}, // k=55 + {0.7730104533627370, -0.6343932841636455}, // k=56 + {0.7651672656224590, -0.6438315428897914}, // k=57 + {0.7572088465064846, -0.6531728429537768}, // k=58 + {0.7491363945234594, -0.6624157775901718}, // k=59 + {0.7409511253549591, -0.6715589548470183}, // k=60 + {0.7326542716724128, -0.6806009977954530}, // k=61 + {0.7242470829514670, -0.6895405447370668}, // k=62 + {0.7157308252838186, -0.6983762494089729}, // k=63 + {0.7071067811865476, -0.7071067811865475}, // k=64 + {0.6983762494089729, -0.7157308252838186}, // k=65 + {0.6895405447370669, -0.7242470829514669}, // k=66 + {0.6806009977954531, -0.7326542716724128}, // k=67 + {0.6715589548470183, -0.7409511253549591}, // k=68 + {0.6624157775901718, -0.7491363945234593}, // k=69 + {0.6531728429537768, -0.7572088465064845}, // k=70 + {0.6438315428897915, -0.7651672656224590}, // k=71 + {0.6343932841636455, -0.7730104533627370}, // k=72 + {0.6248594881423865, -0.7807372285720944}, // k=73 + {0.6152315905806268, -0.7883464276266062}, // k=74 + {0.6055110414043255, -0.7958369046088835}, // k=75 + {0.5956993044924335, -0.8032075314806448}, // k=76 + {0.5857978574564389, -0.8104571982525948}, // k=77 + {0.5758081914178453, -0.8175848131515837}, // k=78 + {0.5657318107836132, -0.8245893027850253}, // k=79 + {0.5555702330196023, -0.8314696123025452}, // k=80 + {0.5453249884220465, -0.8382247055548380}, // k=81 + {0.5349976198870973, -0.8448535652497070}, // k=82 + {0.5245896826784688, -0.8513551931052652}, // k=83 + {0.5141027441932217, 
-0.8577286100002721}, // k=84 + {0.5035383837257176, -0.8639728561215867}, // k=85 + {0.4928981922297841, -0.8700869911087113}, // k=86 + {0.4821837720791228, -0.8760700941954066}, // k=87 + {0.4713967368259978, -0.8819212643483549}, // k=88 + {0.4605387109582400, -0.8876396204028539}, // k=89 + {0.4496113296546066, -0.8932243011955153}, // k=90 + {0.4386162385385277, -0.8986744656939538}, // k=91 + {0.4275550934302822, -0.9039892931234433}, // k=92 + {0.4164295600976373, -0.9091679830905223}, // k=93 + {0.4052413140049899, -0.9142097557035307}, // k=94 + {0.3939920400610481, -0.9191138516900578}, // k=95 + {0.3826834323650898, -0.9238795325112867}, // k=96 + {0.3713171939518376, -0.9285060804732155}, // k=97 + {0.3598950365349883, -0.9329927988347388}, // k=98 + {0.3484186802494345, -0.9373390119125750}, // k=99 + {0.3368898533922201, -0.9415440651830208}, // k=100 + {0.3253102921622630, -0.9456073253805213}, // k=101 + {0.3136817403988916, -0.9495281805930367}, // k=102 + {0.3020059493192282, -0.9533060403541938}, // k=103 + {0.2902846772544623, -0.9569403357322089}, // k=104 + {0.2785196893850531, -0.9604305194155658}, // k=105 + {0.2667127574748984, -0.9637760657954398}, // k=106 + {0.2548656596045146, -0.9669764710448521}, // k=107 + {0.2429801799032640, -0.9700312531945440}, // k=108 + {0.2310581082806713, -0.9729399522055601}, // k=109 + {0.2191012401568698, -0.9757021300385286}, // k=110 + {0.2071113761922186, -0.9783173707196277}, // k=111 + {0.1950903220161283, -0.9807852804032304}, // k=112 + {0.1830398879551411, -0.9831054874312163}, // k=113 + {0.1709618887603014, -0.9852776423889412}, // k=114 + {0.1588581433338614, -0.9873014181578584}, // k=115 + {0.1467304744553617, -0.9891765099647810}, // k=116 + {0.1345807085071262, -0.9909026354277800}, // k=117 + {0.1224106751992163, -0.9924795345987100}, // k=118 + {0.1102222072938832, -0.9939069700023561}, // k=119 + {0.0980171403295608, -0.9951847266721968}, // k=120 + {0.0857973123444399, 
-0.9963126121827780}, // k=121 + {0.0735645635996675, -0.9972904566786902}, // k=122 + {0.0613207363022086, -0.9981181129001492}, // k=123 + {0.0490676743274181, -0.9987954562051724}, // k=124 + {0.0368072229413590, -0.9993223845883495}, // k=125 + {0.0245412285229123, -0.9996988186962042}, // k=126 + {0.0122715382857199, -0.9999247018391445}, // k=127 + {0.0000000000000001, -1.0000000000000000}, // k=128 + {-0.0122715382857198, -0.9999247018391445}, // k=129 + {-0.0245412285229121, -0.9996988186962042}, // k=130 + {-0.0368072229413589, -0.9993223845883495}, // k=131 + {-0.0490676743274180, -0.9987954562051724}, // k=132 + {-0.0613207363022085, -0.9981181129001492}, // k=133 + {-0.0735645635996673, -0.9972904566786902}, // k=134 + {-0.0857973123444398, -0.9963126121827780}, // k=135 + {-0.0980171403295606, -0.9951847266721969}, // k=136 + {-0.1102222072938831, -0.9939069700023561}, // k=137 + {-0.1224106751992162, -0.9924795345987100}, // k=138 + {-0.1345807085071261, -0.9909026354277800}, // k=139 + {-0.1467304744553616, -0.9891765099647810}, // k=140 + {-0.1588581433338613, -0.9873014181578584}, // k=141 + {-0.1709618887603012, -0.9852776423889412}, // k=142 + {-0.1830398879551409, -0.9831054874312163}, // k=143 + {-0.1950903220161282, -0.9807852804032304}, // k=144 + {-0.2071113761922184, -0.9783173707196277}, // k=145 + {-0.2191012401568697, -0.9757021300385286}, // k=146 + {-0.2310581082806711, -0.9729399522055602}, // k=147 + {-0.2429801799032639, -0.9700312531945440}, // k=148 + {-0.2548656596045145, -0.9669764710448521}, // k=149 + {-0.2667127574748983, -0.9637760657954398}, // k=150 + {-0.2785196893850529, -0.9604305194155659}, // k=151 + {-0.2902846772544622, -0.9569403357322089}, // k=152 + {-0.3020059493192281, -0.9533060403541939}, // k=153 + {-0.3136817403988914, -0.9495281805930367}, // k=154 + {-0.3253102921622629, -0.9456073253805214}, // k=155 + {-0.3368898533922199, -0.9415440651830208}, // k=156 + {-0.3484186802494344, -0.9373390119125750}, // 
k=157 + {-0.3598950365349882, -0.9329927988347388}, // k=158 + {-0.3713171939518375, -0.9285060804732156}, // k=159 + {-0.3826834323650897, -0.9238795325112867}, // k=160 + {-0.3939920400610480, -0.9191138516900578}, // k=161 + {-0.4052413140049897, -0.9142097557035307}, // k=162 + {-0.4164295600976370, -0.9091679830905225}, // k=163 + {-0.4275550934302819, -0.9039892931234434}, // k=164 + {-0.4386162385385274, -0.8986744656939539}, // k=165 + {-0.4496113296546067, -0.8932243011955152}, // k=166 + {-0.4605387109582401, -0.8876396204028539}, // k=167 + {-0.4713967368259977, -0.8819212643483550}, // k=168 + {-0.4821837720791227, -0.8760700941954066}, // k=169 + {-0.4928981922297840, -0.8700869911087115}, // k=170 + {-0.5035383837257175, -0.8639728561215868}, // k=171 + {-0.5141027441932217, -0.8577286100002721}, // k=172 + {-0.5245896826784687, -0.8513551931052652}, // k=173 + {-0.5349976198870970, -0.8448535652497072}, // k=174 + {-0.5453249884220462, -0.8382247055548382}, // k=175 + {-0.5555702330196020, -0.8314696123025455}, // k=176 + {-0.5657318107836132, -0.8245893027850252}, // k=177 + {-0.5758081914178453, -0.8175848131515837}, // k=178 + {-0.5857978574564389, -0.8104571982525948}, // k=179 + {-0.5956993044924334, -0.8032075314806449}, // k=180 + {-0.6055110414043254, -0.7958369046088836}, // k=181 + {-0.6152315905806267, -0.7883464276266063}, // k=182 + {-0.6248594881423862, -0.7807372285720946}, // k=183 + {-0.6343932841636454, -0.7730104533627371}, // k=184 + {-0.6438315428897913, -0.7651672656224591}, // k=185 + {-0.6531728429537765, -0.7572088465064847}, // k=186 + {-0.6624157775901719, -0.7491363945234593}, // k=187 + {-0.6715589548470184, -0.7409511253549590}, // k=188 + {-0.6806009977954530, -0.7326542716724128}, // k=189 + {-0.6895405447370669, -0.7242470829514669}, // k=190 + {-0.6983762494089728, -0.7157308252838187}, // k=191 + {-0.7071067811865475, -0.7071067811865476}, // k=192 + {-0.7157308252838186, -0.6983762494089729}, // k=193 + 
{-0.7242470829514668, -0.6895405447370671}, // k=194 + {-0.7326542716724127, -0.6806009977954532}, // k=195 + {-0.7409511253549589, -0.6715589548470186}, // k=196 + {-0.7491363945234591, -0.6624157775901720}, // k=197 + {-0.7572088465064846, -0.6531728429537766}, // k=198 + {-0.7651672656224590, -0.6438315428897914}, // k=199 + {-0.7730104533627370, -0.6343932841636455}, // k=200 + {-0.7807372285720945, -0.6248594881423863}, // k=201 + {-0.7883464276266062, -0.6152315905806269}, // k=202 + {-0.7958369046088835, -0.6055110414043257}, // k=203 + {-0.8032075314806448, -0.5956993044924335}, // k=204 + {-0.8104571982525947, -0.5857978574564390}, // k=205 + {-0.8175848131515836, -0.5758081914178454}, // k=206 + {-0.8245893027850251, -0.5657318107836135}, // k=207 + {-0.8314696123025453, -0.5555702330196022}, // k=208 + {-0.8382247055548381, -0.5453249884220464}, // k=209 + {-0.8448535652497071, -0.5349976198870972}, // k=210 + {-0.8513551931052652, -0.5245896826784689}, // k=211 + {-0.8577286100002720, -0.5141027441932218}, // k=212 + {-0.8639728561215867, -0.5035383837257177}, // k=213 + {-0.8700869911087113, -0.4928981922297841}, // k=214 + {-0.8760700941954065, -0.4821837720791229}, // k=215 + {-0.8819212643483549, -0.4713967368259979}, // k=216 + {-0.8876396204028538, -0.4605387109582402}, // k=217 + {-0.8932243011955152, -0.4496113296546069}, // k=218 + {-0.8986744656939539, -0.4386162385385275}, // k=219 + {-0.9039892931234433, -0.4275550934302820}, // k=220 + {-0.9091679830905224, -0.4164295600976372}, // k=221 + {-0.9142097557035307, -0.4052413140049899}, // k=222 + {-0.9191138516900578, -0.3939920400610482}, // k=223 + {-0.9238795325112867, -0.3826834323650899}, // k=224 + {-0.9285060804732155, -0.3713171939518377}, // k=225 + {-0.9329927988347388, -0.3598950365349883}, // k=226 + {-0.9373390119125748, -0.3484186802494348}, // k=227 + {-0.9415440651830207, -0.3368898533922203}, // k=228 + {-0.9456073253805212, -0.3253102921622633}, // k=229 + 
{-0.9495281805930367, -0.3136817403988914}, // k=230 + {-0.9533060403541939, -0.3020059493192280}, // k=231 + {-0.9569403357322088, -0.2902846772544624}, // k=232 + {-0.9604305194155658, -0.2785196893850532}, // k=233 + {-0.9637760657954398, -0.2667127574748985}, // k=234 + {-0.9669764710448521, -0.2548656596045147}, // k=235 + {-0.9700312531945440, -0.2429801799032641}, // k=236 + {-0.9729399522055601, -0.2310581082806713}, // k=237 + {-0.9757021300385285, -0.2191012401568700}, // k=238 + {-0.9783173707196275, -0.2071113761922188}, // k=239 + {-0.9807852804032304, -0.1950903220161286}, // k=240 + {-0.9831054874312163, -0.1830398879551409}, // k=241 + {-0.9852776423889412, -0.1709618887603012}, // k=242 + {-0.9873014181578584, -0.1588581433338615}, // k=243 + {-0.9891765099647810, -0.1467304744553618}, // k=244 + {-0.9909026354277800, -0.1345807085071263}, // k=245 + {-0.9924795345987100, -0.1224106751992163}, // k=246 + {-0.9939069700023561, -0.1102222072938832}, // k=247 + {-0.9951847266721968, -0.0980171403295608}, // k=248 + {-0.9963126121827780, -0.0857973123444402}, // k=249 + {-0.9972904566786902, -0.0735645635996677}, // k=250 + {-0.9981181129001492, -0.0613207363022085}, // k=251 + {-0.9987954562051724, -0.0490676743274180}, // k=252 + {-0.9993223845883495, -0.0368072229413588}, // k=253 + {-0.9996988186962042, -0.0245412285229123}, // k=254 + {-0.9999247018391445, -0.0122715382857200}, // k=255 + {-1.0000000000000000, -0.0000000000000001}, // k=256 + {-0.9999247018391445, 0.0122715382857198}, // k=257 + {-0.9996988186962042, 0.0245412285229121}, // k=258 + {-0.9993223845883495, 0.0368072229413586}, // k=259 + {-0.9987954562051724, 0.0490676743274177}, // k=260 + {-0.9981181129001492, 0.0613207363022082}, // k=261 + {-0.9972904566786902, 0.0735645635996675}, // k=262 + {-0.9963126121827780, 0.0857973123444399}, // k=263 + {-0.9951847266721969, 0.0980171403295606}, // k=264 + {-0.9939069700023561, 0.1102222072938830}, // k=265 + {-0.9924795345987100, 
0.1224106751992161}, // k=266 + {-0.9909026354277800, 0.1345807085071261}, // k=267 + {-0.9891765099647810, 0.1467304744553616}, // k=268 + {-0.9873014181578584, 0.1588581433338612}, // k=269 + {-0.9852776423889413, 0.1709618887603010}, // k=270 + {-0.9831054874312164, 0.1830398879551406}, // k=271 + {-0.9807852804032304, 0.1950903220161284}, // k=272 + {-0.9783173707196277, 0.2071113761922186}, // k=273 + {-0.9757021300385286, 0.2191012401568698}, // k=274 + {-0.9729399522055602, 0.2310581082806711}, // k=275 + {-0.9700312531945440, 0.2429801799032638}, // k=276 + {-0.9669764710448522, 0.2548656596045145}, // k=277 + {-0.9637760657954400, 0.2667127574748983}, // k=278 + {-0.9604305194155659, 0.2785196893850529}, // k=279 + {-0.9569403357322089, 0.2902846772544621}, // k=280 + {-0.9533060403541940, 0.3020059493192278}, // k=281 + {-0.9495281805930368, 0.3136817403988912}, // k=282 + {-0.9456073253805213, 0.3253102921622630}, // k=283 + {-0.9415440651830208, 0.3368898533922201}, // k=284 + {-0.9373390119125750, 0.3484186802494346}, // k=285 + {-0.9329927988347390, 0.3598950365349881}, // k=286 + {-0.9285060804732156, 0.3713171939518374}, // k=287 + {-0.9238795325112868, 0.3826834323650897}, // k=288 + {-0.9191138516900578, 0.3939920400610479}, // k=289 + {-0.9142097557035307, 0.4052413140049897}, // k=290 + {-0.9091679830905225, 0.4164295600976369}, // k=291 + {-0.9039892931234434, 0.4275550934302818}, // k=292 + {-0.8986744656939540, 0.4386162385385273}, // k=293 + {-0.8932243011955153, 0.4496113296546067}, // k=294 + {-0.8876396204028539, 0.4605387109582401}, // k=295 + {-0.8819212643483550, 0.4713967368259976}, // k=296 + {-0.8760700941954066, 0.4821837720791227}, // k=297 + {-0.8700869911087115, 0.4928981922297839}, // k=298 + {-0.8639728561215868, 0.5035383837257175}, // k=299 + {-0.8577286100002721, 0.5141027441932216}, // k=300 + {-0.8513551931052653, 0.5245896826784687}, // k=301 + {-0.8448535652497072, 0.5349976198870969}, // k=302 + {-0.8382247055548382, 
0.5453249884220461}, // k=303 + {-0.8314696123025455, 0.5555702330196020}, // k=304 + {-0.8245893027850253, 0.5657318107836132}, // k=305 + {-0.8175848131515837, 0.5758081914178453}, // k=306 + {-0.8104571982525948, 0.5857978574564389}, // k=307 + {-0.8032075314806449, 0.5956993044924332}, // k=308 + {-0.7958369046088836, 0.6055110414043254}, // k=309 + {-0.7883464276266063, 0.6152315905806267}, // k=310 + {-0.7807372285720946, 0.6248594881423862}, // k=311 + {-0.7730104533627371, 0.6343932841636453}, // k=312 + {-0.7651672656224591, 0.6438315428897913}, // k=313 + {-0.7572088465064848, 0.6531728429537765}, // k=314 + {-0.7491363945234593, 0.6624157775901718}, // k=315 + {-0.7409511253549591, 0.6715589548470184}, // k=316 + {-0.7326542716724128, 0.6806009977954530}, // k=317 + {-0.7242470829514670, 0.6895405447370668}, // k=318 + {-0.7157308252838187, 0.6983762494089728}, // k=319 + {-0.7071067811865477, 0.7071067811865475}, // k=320 + {-0.6983762494089730, 0.7157308252838185}, // k=321 + {-0.6895405447370671, 0.7242470829514668}, // k=322 + {-0.6806009977954532, 0.7326542716724126}, // k=323 + {-0.6715589548470187, 0.7409511253549589}, // k=324 + {-0.6624157775901720, 0.7491363945234590}, // k=325 + {-0.6531728429537771, 0.7572088465064842}, // k=326 + {-0.6438315428897915, 0.7651672656224590}, // k=327 + {-0.6343932841636459, 0.7730104533627367}, // k=328 + {-0.6248594881423865, 0.7807372285720944}, // k=329 + {-0.6152315905806273, 0.7883464276266059}, // k=330 + {-0.6055110414043257, 0.7958369046088835}, // k=331 + {-0.5956993044924331, 0.8032075314806451}, // k=332 + {-0.5857978574564391, 0.8104571982525947}, // k=333 + {-0.5758081914178452, 0.8175848131515838}, // k=334 + {-0.5657318107836135, 0.8245893027850251}, // k=335 + {-0.5555702330196022, 0.8314696123025452}, // k=336 + {-0.5453249884220468, 0.8382247055548379}, // k=337 + {-0.5349976198870973, 0.8448535652497070}, // k=338 + {-0.5245896826784694, 0.8513551931052649}, // k=339 + {-0.5141027441932218, 
0.8577286100002720}, // k=340 + {-0.5035383837257180, 0.8639728561215865}, // k=341 + {-0.4928981922297842, 0.8700869911087113}, // k=342 + {-0.4821837720791226, 0.8760700941954067}, // k=343 + {-0.4713967368259979, 0.8819212643483549}, // k=344 + {-0.4605387109582399, 0.8876396204028540}, // k=345 + {-0.4496113296546069, 0.8932243011955152}, // k=346 + {-0.4386162385385276, 0.8986744656939538}, // k=347 + {-0.4275550934302825, 0.9039892931234431}, // k=348 + {-0.4164295600976372, 0.9091679830905224}, // k=349 + {-0.4052413140049904, 0.9142097557035305}, // k=350 + {-0.3939920400610482, 0.9191138516900577}, // k=351 + {-0.3826834323650903, 0.9238795325112865}, // k=352 + {-0.3713171939518378, 0.9285060804732155}, // k=353 + {-0.3598950365349879, 0.9329927988347390}, // k=354 + {-0.3484186802494348, 0.9373390119125748}, // k=355 + {-0.3368898533922199, 0.9415440651830208}, // k=356 + {-0.3253102921622633, 0.9456073253805212}, // k=357 + {-0.3136817403988915, 0.9495281805930367}, // k=358 + {-0.3020059493192285, 0.9533060403541938}, // k=359 + {-0.2902846772544624, 0.9569403357322088}, // k=360 + {-0.2785196893850536, 0.9604305194155657}, // k=361 + {-0.2667127574748985, 0.9637760657954398}, // k=362 + {-0.2548656596045143, 0.9669764710448522}, // k=363 + {-0.2429801799032641, 0.9700312531945440}, // k=364 + {-0.2310581082806709, 0.9729399522055602}, // k=365 + {-0.2191012401568701, 0.9757021300385285}, // k=366 + {-0.2071113761922185, 0.9783173707196277}, // k=367 + {-0.1950903220161287, 0.9807852804032303}, // k=368 + {-0.1830398879551410, 0.9831054874312163}, // k=369 + {-0.1709618887603017, 0.9852776423889411}, // k=370 + {-0.1588581433338615, 0.9873014181578583}, // k=371 + {-0.1467304744553623, 0.9891765099647809}, // k=372 + {-0.1345807085071264, 0.9909026354277800}, // k=373 + {-0.1224106751992160, 0.9924795345987101}, // k=374 + {-0.1102222072938833, 0.9939069700023561}, // k=375 + {-0.0980171403295605, 0.9951847266721969}, // k=376 + {-0.0857973123444402, 
0.9963126121827780}, // k=377 + {-0.0735645635996674, 0.9972904566786902}, // k=378 + {-0.0613207363022090, 0.9981181129001492}, // k=379 + {-0.0490676743274180, 0.9987954562051724}, // k=380 + {-0.0368072229413593, 0.9993223845883494}, // k=381 + {-0.0245412285229124, 0.9996988186962042}, // k=382 + {-0.0122715382857205, 0.9999247018391445}, // k=383 + {-0.0000000000000002, 1.0000000000000000}, // k=384 + {0.0122715382857201, 0.9999247018391445}, // k=385 + {0.0245412285229120, 0.9996988186962042}, // k=386 + {0.0368072229413590, 0.9993223845883495}, // k=387 + {0.0490676743274177, 0.9987954562051724}, // k=388 + {0.0613207363022086, 0.9981181129001492}, // k=389 + {0.0735645635996670, 0.9972904566786902}, // k=390 + {0.0857973123444399, 0.9963126121827780}, // k=391 + {0.0980171403295601, 0.9951847266721969}, // k=392 + {0.1102222072938829, 0.9939069700023561}, // k=393 + {0.1224106751992156, 0.9924795345987101}, // k=394 + {0.1345807085071260, 0.9909026354277800}, // k=395 + {0.1467304744553619, 0.9891765099647809}, // k=396 + {0.1588581433338612, 0.9873014181578584}, // k=397 + {0.1709618887603013, 0.9852776423889412}, // k=398 + {0.1830398879551406, 0.9831054874312164}, // k=399 + {0.1950903220161283, 0.9807852804032304}, // k=400 + {0.2071113761922181, 0.9783173707196278}, // k=401 + {0.2191012401568697, 0.9757021300385286}, // k=402 + {0.2310581082806706, 0.9729399522055603}, // k=403 + {0.2429801799032638, 0.9700312531945440}, // k=404 + {0.2548656596045140, 0.9669764710448523}, // k=405 + {0.2667127574748982, 0.9637760657954400}, // k=406 + {0.2785196893850533, 0.9604305194155658}, // k=407 + {0.2902846772544621, 0.9569403357322089}, // k=408 + {0.3020059493192281, 0.9533060403541939}, // k=409 + {0.3136817403988911, 0.9495281805930368}, // k=410 + {0.3253102921622629, 0.9456073253805213}, // k=411 + {0.3368898533922196, 0.9415440651830209}, // k=412 + {0.3484186802494345, 0.9373390119125750}, // k=413 + {0.3598950365349876, 0.9329927988347391}, // k=414 + 
{0.3713171939518374, 0.9285060804732156}, // k=415 + {0.3826834323650900, 0.9238795325112866}, // k=416 + {0.3939920400610479, 0.9191138516900579}, // k=417 + {0.4052413140049900, 0.9142097557035306}, // k=418 + {0.4164295600976369, 0.9091679830905225}, // k=419 + {0.4275550934302821, 0.9039892931234433}, // k=420 + {0.4386162385385273, 0.8986744656939540}, // k=421 + {0.4496113296546066, 0.8932243011955153}, // k=422 + {0.4605387109582396, 0.8876396204028542}, // k=423 + {0.4713967368259976, 0.8819212643483550}, // k=424 + {0.4821837720791222, 0.8760700941954069}, // k=425 + {0.4928981922297839, 0.8700869911087115}, // k=426 + {0.5035383837257178, 0.8639728561215866}, // k=427 + {0.5141027441932216, 0.8577286100002722}, // k=428 + {0.5245896826784691, 0.8513551931052651}, // k=429 + {0.5349976198870969, 0.8448535652497072}, // k=430 + {0.5453249884220465, 0.8382247055548380}, // k=431 + {0.5555702330196018, 0.8314696123025455}, // k=432 + {0.5657318107836131, 0.8245893027850253}, // k=433 + {0.5758081914178449, 0.8175848131515840}, // k=434 + {0.5857978574564388, 0.8104571982525949}, // k=435 + {0.5956993044924329, 0.8032075314806453}, // k=436 + {0.6055110414043253, 0.7958369046088837}, // k=437 + {0.6152315905806270, 0.7883464276266061}, // k=438 + {0.6248594881423861, 0.7807372285720946}, // k=439 + {0.6343932841636456, 0.7730104533627369}, // k=440 + {0.6438315428897912, 0.7651672656224592}, // k=441 + {0.6531728429537768, 0.7572088465064846}, // k=442 + {0.6624157775901715, 0.7491363945234596}, // k=443 + {0.6715589548470183, 0.7409511253549591}, // k=444 + {0.6806009977954527, 0.7326542716724131}, // k=445 + {0.6895405447370668, 0.7242470829514670}, // k=446 + {0.6983762494089724, 0.7157308252838190}, // k=447 + {0.7071067811865474, 0.7071067811865477}, // k=448 + {0.7157308252838188, 0.6983762494089727}, // k=449 + {0.7242470829514667, 0.6895405447370672}, // k=450 + {0.7326542716724129, 0.6806009977954530}, // k=451 + {0.7409511253549589, 
0.6715589548470187}, // k=452 + {0.7491363945234594, 0.6624157775901718}, // k=453 + {0.7572088465064842, 0.6531728429537771}, // k=454 + {0.7651672656224588, 0.6438315428897915}, // k=455 + {0.7730104533627367, 0.6343932841636459}, // k=456 + {0.7807372285720944, 0.6248594881423865}, // k=457 + {0.7883464276266059, 0.6152315905806274}, // k=458 + {0.7958369046088833, 0.6055110414043257}, // k=459 + {0.8032075314806451, 0.5956993044924332}, // k=460 + {0.8104571982525947, 0.5857978574564391}, // k=461 + {0.8175848131515837, 0.5758081914178452}, // k=462 + {0.8245893027850251, 0.5657318107836136}, // k=463 + {0.8314696123025452, 0.5555702330196022}, // k=464 + {0.8382247055548377, 0.5453249884220468}, // k=465 + {0.8448535652497070, 0.5349976198870973}, // k=466 + {0.8513551931052649, 0.5245896826784694}, // k=467 + {0.8577286100002720, 0.5141027441932219}, // k=468 + {0.8639728561215864, 0.5035383837257181}, // k=469 + {0.8700869911087113, 0.4928981922297843}, // k=470 + {0.8760700941954067, 0.4821837720791226}, // k=471 + {0.8819212643483548, 0.4713967368259979}, // k=472 + {0.8876396204028539, 0.4605387109582399}, // k=473 + {0.8932243011955151, 0.4496113296546070}, // k=474 + {0.8986744656939538, 0.4386162385385277}, // k=475 + {0.9039892931234431, 0.4275550934302825}, // k=476 + {0.9091679830905224, 0.4164295600976373}, // k=477 + {0.9142097557035305, 0.4052413140049904}, // k=478 + {0.9191138516900577, 0.3939920400610483}, // k=479 + {0.9238795325112865, 0.3826834323650904}, // k=480 + {0.9285060804732155, 0.3713171939518378}, // k=481 + {0.9329927988347390, 0.3598950365349880}, // k=482 + {0.9373390119125748, 0.3484186802494349}, // k=483 + {0.9415440651830208, 0.3368898533922200}, // k=484 + {0.9456073253805212, 0.3253102921622634}, // k=485 + {0.9495281805930367, 0.3136817403988915}, // k=486 + {0.9533060403541936, 0.3020059493192286}, // k=487 + {0.9569403357322088, 0.2902846772544625}, // k=488 + {0.9604305194155657, 0.2785196893850537}, // k=489 + 
{0.9637760657954398, 0.2667127574748986}, // k=490 + {0.9669764710448522, 0.2548656596045144}, // k=491 + {0.9700312531945440, 0.2429801799032642}, // k=492 + {0.9729399522055602, 0.2310581082806710}, // k=493 + {0.9757021300385285, 0.2191012401568702}, // k=494 + {0.9783173707196277, 0.2071113761922185}, // k=495 + {0.9807852804032303, 0.1950903220161287}, // k=496 + {0.9831054874312163, 0.1830398879551410}, // k=497 + {0.9852776423889411, 0.1709618887603018}, // k=498 + {0.9873014181578583, 0.1588581433338616}, // k=499 + {0.9891765099647809, 0.1467304744553624}, // k=500 + {0.9909026354277800, 0.1345807085071264}, // k=501 + {0.9924795345987100, 0.1224106751992160}, // k=502 + {0.9939069700023561, 0.1102222072938834}, // k=503 + {0.9951847266721969, 0.0980171403295605}, // k=504 + {0.9963126121827780, 0.0857973123444403}, // k=505 + {0.9972904566786902, 0.0735645635996674}, // k=506 + {0.9981181129001492, 0.0613207363022091}, // k=507 + {0.9987954562051724, 0.0490676743274181}, // k=508 + {0.9993223845883494, 0.0368072229413594}, // k=509 + {0.9996988186962042, 0.0245412285229124}, // k=510 + {0.9999247018391445, 0.0122715382857206}, // k=511 +}; + +__constant__ double2 c_twiddle_inv_512[512] = { + {1.0000000000000000, 0.0000000000000000}, // k=0 + {0.9999247018391445, 0.0122715382857199}, // k=1 + {0.9996988186962042, 0.0245412285229123}, // k=2 + {0.9993223845883495, 0.0368072229413588}, // k=3 + {0.9987954562051724, 0.0490676743274180}, // k=4 + {0.9981181129001492, 0.0613207363022086}, // k=5 + {0.9972904566786902, 0.0735645635996674}, // k=6 + {0.9963126121827780, 0.0857973123444399}, // k=7 + {0.9951847266721969, 0.0980171403295606}, // k=8 + {0.9939069700023561, 0.1102222072938831}, // k=9 + {0.9924795345987100, 0.1224106751992162}, // k=10 + {0.9909026354277800, 0.1345807085071262}, // k=11 + {0.9891765099647810, 0.1467304744553617}, // k=12 + {0.9873014181578584, 0.1588581433338614}, // k=13 + {0.9852776423889412, 0.1709618887603012}, // k=14 + 
{0.9831054874312163, 0.1830398879551410}, // k=15 + {0.9807852804032304, 0.1950903220161282}, // k=16 + {0.9783173707196277, 0.2071113761922186}, // k=17 + {0.9757021300385286, 0.2191012401568698}, // k=18 + {0.9729399522055602, 0.2310581082806711}, // k=19 + {0.9700312531945440, 0.2429801799032639}, // k=20 + {0.9669764710448521, 0.2548656596045146}, // k=21 + {0.9637760657954398, 0.2667127574748984}, // k=22 + {0.9604305194155658, 0.2785196893850531}, // k=23 + {0.9569403357322088, 0.2902846772544623}, // k=24 + {0.9533060403541939, 0.3020059493192281}, // k=25 + {0.9495281805930367, 0.3136817403988915}, // k=26 + {0.9456073253805213, 0.3253102921622629}, // k=27 + {0.9415440651830208, 0.3368898533922201}, // k=28 + {0.9373390119125750, 0.3484186802494346}, // k=29 + {0.9329927988347390, 0.3598950365349881}, // k=30 + {0.9285060804732156, 0.3713171939518375}, // k=31 + {0.9238795325112867, 0.3826834323650898}, // k=32 + {0.9191138516900578, 0.3939920400610481}, // k=33 + {0.9142097557035307, 0.4052413140049899}, // k=34 + {0.9091679830905224, 0.4164295600976372}, // k=35 + {0.9039892931234433, 0.4275550934302821}, // k=36 + {0.8986744656939538, 0.4386162385385277}, // k=37 + {0.8932243011955153, 0.4496113296546065}, // k=38 + {0.8876396204028539, 0.4605387109582400}, // k=39 + {0.8819212643483550, 0.4713967368259976}, // k=40 + {0.8760700941954066, 0.4821837720791227}, // k=41 + {0.8700869911087115, 0.4928981922297840}, // k=42 + {0.8639728561215868, 0.5035383837257176}, // k=43 + {0.8577286100002721, 0.5141027441932217}, // k=44 + {0.8513551931052652, 0.5245896826784689}, // k=45 + {0.8448535652497071, 0.5349976198870972}, // k=46 + {0.8382247055548381, 0.5453249884220465}, // k=47 + {0.8314696123025452, 0.5555702330196022}, // k=48 + {0.8245893027850253, 0.5657318107836131}, // k=49 + {0.8175848131515837, 0.5758081914178453}, // k=50 + {0.8104571982525948, 0.5857978574564389}, // k=51 + {0.8032075314806449, 0.5956993044924334}, // k=52 + {0.7958369046088836, 
0.6055110414043255}, // k=53 + {0.7883464276266063, 0.6152315905806268}, // k=54 + {0.7807372285720945, 0.6248594881423863}, // k=55 + {0.7730104533627370, 0.6343932841636455}, // k=56 + {0.7651672656224590, 0.6438315428897914}, // k=57 + {0.7572088465064846, 0.6531728429537768}, // k=58 + {0.7491363945234594, 0.6624157775901718}, // k=59 + {0.7409511253549591, 0.6715589548470183}, // k=60 + {0.7326542716724128, 0.6806009977954530}, // k=61 + {0.7242470829514670, 0.6895405447370668}, // k=62 + {0.7157308252838186, 0.6983762494089729}, // k=63 + {0.7071067811865476, 0.7071067811865475}, // k=64 + {0.6983762494089729, 0.7157308252838186}, // k=65 + {0.6895405447370669, 0.7242470829514669}, // k=66 + {0.6806009977954531, 0.7326542716724128}, // k=67 + {0.6715589548470183, 0.7409511253549591}, // k=68 + {0.6624157775901718, 0.7491363945234593}, // k=69 + {0.6531728429537768, 0.7572088465064845}, // k=70 + {0.6438315428897915, 0.7651672656224590}, // k=71 + {0.6343932841636455, 0.7730104533627370}, // k=72 + {0.6248594881423865, 0.7807372285720944}, // k=73 + {0.6152315905806268, 0.7883464276266062}, // k=74 + {0.6055110414043255, 0.7958369046088835}, // k=75 + {0.5956993044924335, 0.8032075314806448}, // k=76 + {0.5857978574564389, 0.8104571982525948}, // k=77 + {0.5758081914178453, 0.8175848131515837}, // k=78 + {0.5657318107836132, 0.8245893027850253}, // k=79 + {0.5555702330196023, 0.8314696123025452}, // k=80 + {0.5453249884220465, 0.8382247055548380}, // k=81 + {0.5349976198870973, 0.8448535652497070}, // k=82 + {0.5245896826784688, 0.8513551931052652}, // k=83 + {0.5141027441932217, 0.8577286100002721}, // k=84 + {0.5035383837257176, 0.8639728561215867}, // k=85 + {0.4928981922297841, 0.8700869911087113}, // k=86 + {0.4821837720791228, 0.8760700941954066}, // k=87 + {0.4713967368259978, 0.8819212643483549}, // k=88 + {0.4605387109582400, 0.8876396204028539}, // k=89 + {0.4496113296546066, 0.8932243011955153}, // k=90 + {0.4386162385385277, 0.8986744656939538}, // 
k=91 + {0.4275550934302822, 0.9039892931234433}, // k=92 + {0.4164295600976373, 0.9091679830905223}, // k=93 + {0.4052413140049899, 0.9142097557035307}, // k=94 + {0.3939920400610481, 0.9191138516900578}, // k=95 + {0.3826834323650898, 0.9238795325112867}, // k=96 + {0.3713171939518376, 0.9285060804732155}, // k=97 + {0.3598950365349883, 0.9329927988347388}, // k=98 + {0.3484186802494345, 0.9373390119125750}, // k=99 + {0.3368898533922201, 0.9415440651830208}, // k=100 + {0.3253102921622630, 0.9456073253805213}, // k=101 + {0.3136817403988916, 0.9495281805930367}, // k=102 + {0.3020059493192282, 0.9533060403541938}, // k=103 + {0.2902846772544623, 0.9569403357322089}, // k=104 + {0.2785196893850531, 0.9604305194155658}, // k=105 + {0.2667127574748984, 0.9637760657954398}, // k=106 + {0.2548656596045146, 0.9669764710448521}, // k=107 + {0.2429801799032640, 0.9700312531945440}, // k=108 + {0.2310581082806713, 0.9729399522055601}, // k=109 + {0.2191012401568698, 0.9757021300385286}, // k=110 + {0.2071113761922186, 0.9783173707196277}, // k=111 + {0.1950903220161283, 0.9807852804032304}, // k=112 + {0.1830398879551411, 0.9831054874312163}, // k=113 + {0.1709618887603014, 0.9852776423889412}, // k=114 + {0.1588581433338614, 0.9873014181578584}, // k=115 + {0.1467304744553617, 0.9891765099647810}, // k=116 + {0.1345807085071262, 0.9909026354277800}, // k=117 + {0.1224106751992163, 0.9924795345987100}, // k=118 + {0.1102222072938832, 0.9939069700023561}, // k=119 + {0.0980171403295608, 0.9951847266721968}, // k=120 + {0.0857973123444399, 0.9963126121827780}, // k=121 + {0.0735645635996675, 0.9972904566786902}, // k=122 + {0.0613207363022086, 0.9981181129001492}, // k=123 + {0.0490676743274181, 0.9987954562051724}, // k=124 + {0.0368072229413590, 0.9993223845883495}, // k=125 + {0.0245412285229123, 0.9996988186962042}, // k=126 + {0.0122715382857199, 0.9999247018391445}, // k=127 + {0.0000000000000001, 1.0000000000000000}, // k=128 + {-0.0122715382857198, 
0.9999247018391445}, // k=129 + {-0.0245412285229121, 0.9996988186962042}, // k=130 + {-0.0368072229413589, 0.9993223845883495}, // k=131 + {-0.0490676743274180, 0.9987954562051724}, // k=132 + {-0.0613207363022085, 0.9981181129001492}, // k=133 + {-0.0735645635996673, 0.9972904566786902}, // k=134 + {-0.0857973123444398, 0.9963126121827780}, // k=135 + {-0.0980171403295606, 0.9951847266721969}, // k=136 + {-0.1102222072938831, 0.9939069700023561}, // k=137 + {-0.1224106751992162, 0.9924795345987100}, // k=138 + {-0.1345807085071261, 0.9909026354277800}, // k=139 + {-0.1467304744553616, 0.9891765099647810}, // k=140 + {-0.1588581433338613, 0.9873014181578584}, // k=141 + {-0.1709618887603012, 0.9852776423889412}, // k=142 + {-0.1830398879551409, 0.9831054874312163}, // k=143 + {-0.1950903220161282, 0.9807852804032304}, // k=144 + {-0.2071113761922184, 0.9783173707196277}, // k=145 + {-0.2191012401568697, 0.9757021300385286}, // k=146 + {-0.2310581082806711, 0.9729399522055602}, // k=147 + {-0.2429801799032639, 0.9700312531945440}, // k=148 + {-0.2548656596045145, 0.9669764710448521}, // k=149 + {-0.2667127574748983, 0.9637760657954398}, // k=150 + {-0.2785196893850529, 0.9604305194155659}, // k=151 + {-0.2902846772544622, 0.9569403357322089}, // k=152 + {-0.3020059493192281, 0.9533060403541939}, // k=153 + {-0.3136817403988914, 0.9495281805930367}, // k=154 + {-0.3253102921622629, 0.9456073253805214}, // k=155 + {-0.3368898533922199, 0.9415440651830208}, // k=156 + {-0.3484186802494344, 0.9373390119125750}, // k=157 + {-0.3598950365349882, 0.9329927988347388}, // k=158 + {-0.3713171939518375, 0.9285060804732156}, // k=159 + {-0.3826834323650897, 0.9238795325112867}, // k=160 + {-0.3939920400610480, 0.9191138516900578}, // k=161 + {-0.4052413140049897, 0.9142097557035307}, // k=162 + {-0.4164295600976370, 0.9091679830905225}, // k=163 + {-0.4275550934302819, 0.9039892931234434}, // k=164 + {-0.4386162385385274, 0.8986744656939539}, // k=165 + {-0.4496113296546067, 
0.8932243011955152}, // k=166 + {-0.4605387109582401, 0.8876396204028539}, // k=167 + {-0.4713967368259977, 0.8819212643483550}, // k=168 + {-0.4821837720791227, 0.8760700941954066}, // k=169 + {-0.4928981922297840, 0.8700869911087115}, // k=170 + {-0.5035383837257175, 0.8639728561215868}, // k=171 + {-0.5141027441932217, 0.8577286100002721}, // k=172 + {-0.5245896826784687, 0.8513551931052652}, // k=173 + {-0.5349976198870970, 0.8448535652497072}, // k=174 + {-0.5453249884220462, 0.8382247055548382}, // k=175 + {-0.5555702330196020, 0.8314696123025455}, // k=176 + {-0.5657318107836132, 0.8245893027850252}, // k=177 + {-0.5758081914178453, 0.8175848131515837}, // k=178 + {-0.5857978574564389, 0.8104571982525948}, // k=179 + {-0.5956993044924334, 0.8032075314806449}, // k=180 + {-0.6055110414043254, 0.7958369046088836}, // k=181 + {-0.6152315905806267, 0.7883464276266063}, // k=182 + {-0.6248594881423862, 0.7807372285720946}, // k=183 + {-0.6343932841636454, 0.7730104533627371}, // k=184 + {-0.6438315428897913, 0.7651672656224591}, // k=185 + {-0.6531728429537765, 0.7572088465064847}, // k=186 + {-0.6624157775901719, 0.7491363945234593}, // k=187 + {-0.6715589548470184, 0.7409511253549590}, // k=188 + {-0.6806009977954530, 0.7326542716724128}, // k=189 + {-0.6895405447370669, 0.7242470829514669}, // k=190 + {-0.6983762494089728, 0.7157308252838187}, // k=191 + {-0.7071067811865475, 0.7071067811865476}, // k=192 + {-0.7157308252838186, 0.6983762494089729}, // k=193 + {-0.7242470829514668, 0.6895405447370671}, // k=194 + {-0.7326542716724127, 0.6806009977954532}, // k=195 + {-0.7409511253549589, 0.6715589548470186}, // k=196 + {-0.7491363945234591, 0.6624157775901720}, // k=197 + {-0.7572088465064846, 0.6531728429537766}, // k=198 + {-0.7651672656224590, 0.6438315428897914}, // k=199 + {-0.7730104533627370, 0.6343932841636455}, // k=200 + {-0.7807372285720945, 0.6248594881423863}, // k=201 + {-0.7883464276266062, 0.6152315905806269}, // k=202 + {-0.7958369046088835, 
0.6055110414043257}, // k=203 + {-0.8032075314806448, 0.5956993044924335}, // k=204 + {-0.8104571982525947, 0.5857978574564390}, // k=205 + {-0.8175848131515836, 0.5758081914178454}, // k=206 + {-0.8245893027850251, 0.5657318107836135}, // k=207 + {-0.8314696123025453, 0.5555702330196022}, // k=208 + {-0.8382247055548381, 0.5453249884220464}, // k=209 + {-0.8448535652497071, 0.5349976198870972}, // k=210 + {-0.8513551931052652, 0.5245896826784689}, // k=211 + {-0.8577286100002720, 0.5141027441932218}, // k=212 + {-0.8639728561215867, 0.5035383837257177}, // k=213 + {-0.8700869911087113, 0.4928981922297841}, // k=214 + {-0.8760700941954065, 0.4821837720791229}, // k=215 + {-0.8819212643483549, 0.4713967368259979}, // k=216 + {-0.8876396204028538, 0.4605387109582402}, // k=217 + {-0.8932243011955152, 0.4496113296546069}, // k=218 + {-0.8986744656939539, 0.4386162385385275}, // k=219 + {-0.9039892931234433, 0.4275550934302820}, // k=220 + {-0.9091679830905224, 0.4164295600976372}, // k=221 + {-0.9142097557035307, 0.4052413140049899}, // k=222 + {-0.9191138516900578, 0.3939920400610482}, // k=223 + {-0.9238795325112867, 0.3826834323650899}, // k=224 + {-0.9285060804732155, 0.3713171939518377}, // k=225 + {-0.9329927988347388, 0.3598950365349883}, // k=226 + {-0.9373390119125748, 0.3484186802494348}, // k=227 + {-0.9415440651830207, 0.3368898533922203}, // k=228 + {-0.9456073253805212, 0.3253102921622633}, // k=229 + {-0.9495281805930367, 0.3136817403988914}, // k=230 + {-0.9533060403541939, 0.3020059493192280}, // k=231 + {-0.9569403357322088, 0.2902846772544624}, // k=232 + {-0.9604305194155658, 0.2785196893850532}, // k=233 + {-0.9637760657954398, 0.2667127574748985}, // k=234 + {-0.9669764710448521, 0.2548656596045147}, // k=235 + {-0.9700312531945440, 0.2429801799032641}, // k=236 + {-0.9729399522055601, 0.2310581082806713}, // k=237 + {-0.9757021300385285, 0.2191012401568700}, // k=238 + {-0.9783173707196275, 0.2071113761922188}, // k=239 + {-0.9807852804032304, 
0.1950903220161286}, // k=240 + {-0.9831054874312163, 0.1830398879551409}, // k=241 + {-0.9852776423889412, 0.1709618887603012}, // k=242 + {-0.9873014181578584, 0.1588581433338615}, // k=243 + {-0.9891765099647810, 0.1467304744553618}, // k=244 + {-0.9909026354277800, 0.1345807085071263}, // k=245 + {-0.9924795345987100, 0.1224106751992163}, // k=246 + {-0.9939069700023561, 0.1102222072938832}, // k=247 + {-0.9951847266721968, 0.0980171403295608}, // k=248 + {-0.9963126121827780, 0.0857973123444402}, // k=249 + {-0.9972904566786902, 0.0735645635996677}, // k=250 + {-0.9981181129001492, 0.0613207363022085}, // k=251 + {-0.9987954562051724, 0.0490676743274180}, // k=252 + {-0.9993223845883495, 0.0368072229413588}, // k=253 + {-0.9996988186962042, 0.0245412285229123}, // k=254 + {-0.9999247018391445, 0.0122715382857200}, // k=255 + {-1.0000000000000000, 0.0000000000000001}, // k=256 + {-0.9999247018391445, -0.0122715382857198}, // k=257 + {-0.9996988186962042, -0.0245412285229121}, // k=258 + {-0.9993223845883495, -0.0368072229413586}, // k=259 + {-0.9987954562051724, -0.0490676743274177}, // k=260 + {-0.9981181129001492, -0.0613207363022082}, // k=261 + {-0.9972904566786902, -0.0735645635996675}, // k=262 + {-0.9963126121827780, -0.0857973123444399}, // k=263 + {-0.9951847266721969, -0.0980171403295606}, // k=264 + {-0.9939069700023561, -0.1102222072938830}, // k=265 + {-0.9924795345987100, -0.1224106751992161}, // k=266 + {-0.9909026354277800, -0.1345807085071261}, // k=267 + {-0.9891765099647810, -0.1467304744553616}, // k=268 + {-0.9873014181578584, -0.1588581433338612}, // k=269 + {-0.9852776423889413, -0.1709618887603010}, // k=270 + {-0.9831054874312164, -0.1830398879551406}, // k=271 + {-0.9807852804032304, -0.1950903220161284}, // k=272 + {-0.9783173707196277, -0.2071113761922186}, // k=273 + {-0.9757021300385286, -0.2191012401568698}, // k=274 + {-0.9729399522055602, -0.2310581082806711}, // k=275 + {-0.9700312531945440, -0.2429801799032638}, // k=276 + 
{-0.9669764710448522, -0.2548656596045145}, // k=277 + {-0.9637760657954400, -0.2667127574748983}, // k=278 + {-0.9604305194155659, -0.2785196893850529}, // k=279 + {-0.9569403357322089, -0.2902846772544621}, // k=280 + {-0.9533060403541940, -0.3020059493192278}, // k=281 + {-0.9495281805930368, -0.3136817403988912}, // k=282 + {-0.9456073253805213, -0.3253102921622630}, // k=283 + {-0.9415440651830208, -0.3368898533922201}, // k=284 + {-0.9373390119125750, -0.3484186802494346}, // k=285 + {-0.9329927988347390, -0.3598950365349881}, // k=286 + {-0.9285060804732156, -0.3713171939518374}, // k=287 + {-0.9238795325112868, -0.3826834323650897}, // k=288 + {-0.9191138516900578, -0.3939920400610479}, // k=289 + {-0.9142097557035307, -0.4052413140049897}, // k=290 + {-0.9091679830905225, -0.4164295600976369}, // k=291 + {-0.9039892931234434, -0.4275550934302818}, // k=292 + {-0.8986744656939540, -0.4386162385385273}, // k=293 + {-0.8932243011955153, -0.4496113296546067}, // k=294 + {-0.8876396204028539, -0.4605387109582401}, // k=295 + {-0.8819212643483550, -0.4713967368259976}, // k=296 + {-0.8760700941954066, -0.4821837720791227}, // k=297 + {-0.8700869911087115, -0.4928981922297839}, // k=298 + {-0.8639728561215868, -0.5035383837257175}, // k=299 + {-0.8577286100002721, -0.5141027441932216}, // k=300 + {-0.8513551931052653, -0.5245896826784687}, // k=301 + {-0.8448535652497072, -0.5349976198870969}, // k=302 + {-0.8382247055548382, -0.5453249884220461}, // k=303 + {-0.8314696123025455, -0.5555702330196020}, // k=304 + {-0.8245893027850253, -0.5657318107836132}, // k=305 + {-0.8175848131515837, -0.5758081914178453}, // k=306 + {-0.8104571982525948, -0.5857978574564389}, // k=307 + {-0.8032075314806449, -0.5956993044924332}, // k=308 + {-0.7958369046088836, -0.6055110414043254}, // k=309 + {-0.7883464276266063, -0.6152315905806267}, // k=310 + {-0.7807372285720946, -0.6248594881423862}, // k=311 + {-0.7730104533627371, -0.6343932841636453}, // k=312 + 
{-0.7651672656224591, -0.6438315428897913}, // k=313 + {-0.7572088465064848, -0.6531728429537765}, // k=314 + {-0.7491363945234593, -0.6624157775901718}, // k=315 + {-0.7409511253549591, -0.6715589548470184}, // k=316 + {-0.7326542716724128, -0.6806009977954530}, // k=317 + {-0.7242470829514670, -0.6895405447370668}, // k=318 + {-0.7157308252838187, -0.6983762494089728}, // k=319 + {-0.7071067811865477, -0.7071067811865475}, // k=320 + {-0.6983762494089730, -0.7157308252838185}, // k=321 + {-0.6895405447370671, -0.7242470829514668}, // k=322 + {-0.6806009977954532, -0.7326542716724126}, // k=323 + {-0.6715589548470187, -0.7409511253549589}, // k=324 + {-0.6624157775901720, -0.7491363945234590}, // k=325 + {-0.6531728429537771, -0.7572088465064842}, // k=326 + {-0.6438315428897915, -0.7651672656224590}, // k=327 + {-0.6343932841636459, -0.7730104533627367}, // k=328 + {-0.6248594881423865, -0.7807372285720944}, // k=329 + {-0.6152315905806273, -0.7883464276266059}, // k=330 + {-0.6055110414043257, -0.7958369046088835}, // k=331 + {-0.5956993044924331, -0.8032075314806451}, // k=332 + {-0.5857978574564391, -0.8104571982525947}, // k=333 + {-0.5758081914178452, -0.8175848131515838}, // k=334 + {-0.5657318107836135, -0.8245893027850251}, // k=335 + {-0.5555702330196022, -0.8314696123025452}, // k=336 + {-0.5453249884220468, -0.8382247055548379}, // k=337 + {-0.5349976198870973, -0.8448535652497070}, // k=338 + {-0.5245896826784694, -0.8513551931052649}, // k=339 + {-0.5141027441932218, -0.8577286100002720}, // k=340 + {-0.5035383837257180, -0.8639728561215865}, // k=341 + {-0.4928981922297842, -0.8700869911087113}, // k=342 + {-0.4821837720791226, -0.8760700941954067}, // k=343 + {-0.4713967368259979, -0.8819212643483549}, // k=344 + {-0.4605387109582399, -0.8876396204028540}, // k=345 + {-0.4496113296546069, -0.8932243011955152}, // k=346 + {-0.4386162385385276, -0.8986744656939538}, // k=347 + {-0.4275550934302825, -0.9039892931234431}, // k=348 + 
{-0.4164295600976372, -0.9091679830905224}, // k=349 + {-0.4052413140049904, -0.9142097557035305}, // k=350 + {-0.3939920400610482, -0.9191138516900577}, // k=351 + {-0.3826834323650903, -0.9238795325112865}, // k=352 + {-0.3713171939518378, -0.9285060804732155}, // k=353 + {-0.3598950365349879, -0.9329927988347390}, // k=354 + {-0.3484186802494348, -0.9373390119125748}, // k=355 + {-0.3368898533922199, -0.9415440651830208}, // k=356 + {-0.3253102921622633, -0.9456073253805212}, // k=357 + {-0.3136817403988915, -0.9495281805930367}, // k=358 + {-0.3020059493192285, -0.9533060403541938}, // k=359 + {-0.2902846772544624, -0.9569403357322088}, // k=360 + {-0.2785196893850536, -0.9604305194155657}, // k=361 + {-0.2667127574748985, -0.9637760657954398}, // k=362 + {-0.2548656596045143, -0.9669764710448522}, // k=363 + {-0.2429801799032641, -0.9700312531945440}, // k=364 + {-0.2310581082806709, -0.9729399522055602}, // k=365 + {-0.2191012401568701, -0.9757021300385285}, // k=366 + {-0.2071113761922185, -0.9783173707196277}, // k=367 + {-0.1950903220161287, -0.9807852804032303}, // k=368 + {-0.1830398879551410, -0.9831054874312163}, // k=369 + {-0.1709618887603017, -0.9852776423889411}, // k=370 + {-0.1588581433338615, -0.9873014181578583}, // k=371 + {-0.1467304744553623, -0.9891765099647809}, // k=372 + {-0.1345807085071264, -0.9909026354277800}, // k=373 + {-0.1224106751992160, -0.9924795345987101}, // k=374 + {-0.1102222072938833, -0.9939069700023561}, // k=375 + {-0.0980171403295605, -0.9951847266721969}, // k=376 + {-0.0857973123444402, -0.9963126121827780}, // k=377 + {-0.0735645635996674, -0.9972904566786902}, // k=378 + {-0.0613207363022090, -0.9981181129001492}, // k=379 + {-0.0490676743274180, -0.9987954562051724}, // k=380 + {-0.0368072229413593, -0.9993223845883494}, // k=381 + {-0.0245412285229124, -0.9996988186962042}, // k=382 + {-0.0122715382857205, -0.9999247018391445}, // k=383 + {-0.0000000000000002, -1.0000000000000000}, // k=384 + 
{0.0122715382857201, -0.9999247018391445}, // k=385 + {0.0245412285229120, -0.9996988186962042}, // k=386 + {0.0368072229413590, -0.9993223845883495}, // k=387 + {0.0490676743274177, -0.9987954562051724}, // k=388 + {0.0613207363022086, -0.9981181129001492}, // k=389 + {0.0735645635996670, -0.9972904566786902}, // k=390 + {0.0857973123444399, -0.9963126121827780}, // k=391 + {0.0980171403295601, -0.9951847266721969}, // k=392 + {0.1102222072938829, -0.9939069700023561}, // k=393 + {0.1224106751992156, -0.9924795345987101}, // k=394 + {0.1345807085071260, -0.9909026354277800}, // k=395 + {0.1467304744553619, -0.9891765099647809}, // k=396 + {0.1588581433338612, -0.9873014181578584}, // k=397 + {0.1709618887603013, -0.9852776423889412}, // k=398 + {0.1830398879551406, -0.9831054874312164}, // k=399 + {0.1950903220161283, -0.9807852804032304}, // k=400 + {0.2071113761922181, -0.9783173707196278}, // k=401 + {0.2191012401568697, -0.9757021300385286}, // k=402 + {0.2310581082806706, -0.9729399522055603}, // k=403 + {0.2429801799032638, -0.9700312531945440}, // k=404 + {0.2548656596045140, -0.9669764710448523}, // k=405 + {0.2667127574748982, -0.9637760657954400}, // k=406 + {0.2785196893850533, -0.9604305194155658}, // k=407 + {0.2902846772544621, -0.9569403357322089}, // k=408 + {0.3020059493192281, -0.9533060403541939}, // k=409 + {0.3136817403988911, -0.9495281805930368}, // k=410 + {0.3253102921622629, -0.9456073253805213}, // k=411 + {0.3368898533922196, -0.9415440651830209}, // k=412 + {0.3484186802494345, -0.9373390119125750}, // k=413 + {0.3598950365349876, -0.9329927988347391}, // k=414 + {0.3713171939518374, -0.9285060804732156}, // k=415 + {0.3826834323650900, -0.9238795325112866}, // k=416 + {0.3939920400610479, -0.9191138516900579}, // k=417 + {0.4052413140049900, -0.9142097557035306}, // k=418 + {0.4164295600976369, -0.9091679830905225}, // k=419 + {0.4275550934302821, -0.9039892931234433}, // k=420 + {0.4386162385385273, -0.8986744656939540}, // k=421 + 
{0.4496113296546066, -0.8932243011955153}, // k=422 + {0.4605387109582396, -0.8876396204028542}, // k=423 + {0.4713967368259976, -0.8819212643483550}, // k=424 + {0.4821837720791222, -0.8760700941954069}, // k=425 + {0.4928981922297839, -0.8700869911087115}, // k=426 + {0.5035383837257178, -0.8639728561215866}, // k=427 + {0.5141027441932216, -0.8577286100002722}, // k=428 + {0.5245896826784691, -0.8513551931052651}, // k=429 + {0.5349976198870969, -0.8448535652497072}, // k=430 + {0.5453249884220465, -0.8382247055548380}, // k=431 + {0.5555702330196018, -0.8314696123025455}, // k=432 + {0.5657318107836131, -0.8245893027850253}, // k=433 + {0.5758081914178449, -0.8175848131515840}, // k=434 + {0.5857978574564388, -0.8104571982525949}, // k=435 + {0.5956993044924329, -0.8032075314806453}, // k=436 + {0.6055110414043253, -0.7958369046088837}, // k=437 + {0.6152315905806270, -0.7883464276266061}, // k=438 + {0.6248594881423861, -0.7807372285720946}, // k=439 + {0.6343932841636456, -0.7730104533627369}, // k=440 + {0.6438315428897912, -0.7651672656224592}, // k=441 + {0.6531728429537768, -0.7572088465064846}, // k=442 + {0.6624157775901715, -0.7491363945234596}, // k=443 + {0.6715589548470183, -0.7409511253549591}, // k=444 + {0.6806009977954527, -0.7326542716724131}, // k=445 + {0.6895405447370668, -0.7242470829514670}, // k=446 + {0.6983762494089724, -0.7157308252838190}, // k=447 + {0.7071067811865474, -0.7071067811865477}, // k=448 + {0.7157308252838188, -0.6983762494089727}, // k=449 + {0.7242470829514667, -0.6895405447370672}, // k=450 + {0.7326542716724129, -0.6806009977954530}, // k=451 + {0.7409511253549589, -0.6715589548470187}, // k=452 + {0.7491363945234594, -0.6624157775901718}, // k=453 + {0.7572088465064842, -0.6531728429537771}, // k=454 + {0.7651672656224588, -0.6438315428897915}, // k=455 + {0.7730104533627367, -0.6343932841636459}, // k=456 + {0.7807372285720944, -0.6248594881423865}, // k=457 + {0.7883464276266059, -0.6152315905806274}, // k=458 + 
{0.7958369046088833, -0.6055110414043257}, // k=459 + {0.8032075314806451, -0.5956993044924332}, // k=460 + {0.8104571982525947, -0.5857978574564391}, // k=461 + {0.8175848131515837, -0.5758081914178452}, // k=462 + {0.8245893027850251, -0.5657318107836136}, // k=463 + {0.8314696123025452, -0.5555702330196022}, // k=464 + {0.8382247055548377, -0.5453249884220468}, // k=465 + {0.8448535652497070, -0.5349976198870973}, // k=466 + {0.8513551931052649, -0.5245896826784694}, // k=467 + {0.8577286100002720, -0.5141027441932219}, // k=468 + {0.8639728561215864, -0.5035383837257181}, // k=469 + {0.8700869911087113, -0.4928981922297843}, // k=470 + {0.8760700941954067, -0.4821837720791226}, // k=471 + {0.8819212643483548, -0.4713967368259979}, // k=472 + {0.8876396204028539, -0.4605387109582399}, // k=473 + {0.8932243011955151, -0.4496113296546070}, // k=474 + {0.8986744656939538, -0.4386162385385277}, // k=475 + {0.9039892931234431, -0.4275550934302825}, // k=476 + {0.9091679830905224, -0.4164295600976373}, // k=477 + {0.9142097557035305, -0.4052413140049904}, // k=478 + {0.9191138516900577, -0.3939920400610483}, // k=479 + {0.9238795325112865, -0.3826834323650904}, // k=480 + {0.9285060804732155, -0.3713171939518378}, // k=481 + {0.9329927988347390, -0.3598950365349880}, // k=482 + {0.9373390119125748, -0.3484186802494349}, // k=483 + {0.9415440651830208, -0.3368898533922200}, // k=484 + {0.9456073253805212, -0.3253102921622634}, // k=485 + {0.9495281805930367, -0.3136817403988915}, // k=486 + {0.9533060403541936, -0.3020059493192286}, // k=487 + {0.9569403357322088, -0.2902846772544625}, // k=488 + {0.9604305194155657, -0.2785196893850537}, // k=489 + {0.9637760657954398, -0.2667127574748986}, // k=490 + {0.9669764710448522, -0.2548656596045144}, // k=491 + {0.9700312531945440, -0.2429801799032642}, // k=492 + {0.9729399522055602, -0.2310581082806710}, // k=493 + {0.9757021300385285, -0.2191012401568702}, // k=494 + {0.9783173707196277, -0.2071113761922185}, // k=495 + 
{0.9807852804032303, -0.1950903220161287}, // k=496 + {0.9831054874312163, -0.1830398879551410}, // k=497 + {0.9852776423889411, -0.1709618887603018}, // k=498 + {0.9873014181578583, -0.1588581433338616}, // k=499 + {0.9891765099647809, -0.1467304744553624}, // k=500 + {0.9909026354277800, -0.1345807085071264}, // k=501 + {0.9924795345987100, -0.1224106751992160}, // k=502 + {0.9939069700023561, -0.1102222072938834}, // k=503 + {0.9951847266721969, -0.0980171403295605}, // k=504 + {0.9963126121827780, -0.0857973123444403}, // k=505 + {0.9972904566786902, -0.0735645635996674}, // k=506 + {0.9981181129001492, -0.0613207363022091}, // k=507 + {0.9987954562051724, -0.0490676743274181}, // k=508 + {0.9993223845883494, -0.0368072229413594}, // k=509 + {0.9996988186962042, -0.0245412285229124}, // k=510 + {0.9999247018391445, -0.0122715382857206}, // k=511 +}; diff --git a/src/mesh/parallel/shiftedmetric.cxx b/src/mesh/parallel/shiftedmetric.cxx index 382052047d..759c5aaa0a 100644 --- a/src/mesh/parallel/shiftedmetric.cxx +++ b/src/mesh/parallel/shiftedmetric.cxx @@ -17,6 +17,11 @@ #include +#if BOUT_HAS_CUDA +#include +#include +#endif + ShiftedMetric::ShiftedMetric(Mesh& m, CELL_LOC location_in, Field2D zShift_, BoutReal zlength_in, Options* opt) : ParallelTransform(m, opt), location(location_in), zShift(std::move(zShift_)), @@ -38,8 +43,8 @@ void ShiftedMetric::checkInputGrid() { "Should be 'shiftedmetric'."); } } // else: parallel_transform variable not found in grid input, indicates older input - // file or grid from options so must rely on the user having ensured the type is - // correct + // file or grid from options so must rely on the user having ensured the type is + // correct } void ShiftedMetric::outputVars(Options& output_options) { @@ -67,6 +72,7 @@ void ShiftedMetric::cachePhases() { toAlignedPhs = Tensor(mesh.LocalNx, mesh.LocalNy, nmodes); // To/From field aligned phases + // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; 
BOUT_FOR(i, mesh.getRegion2D("RGN_ALL")) { int ix = i.x(); int iy = i.y(); @@ -105,6 +111,7 @@ void ShiftedMetric::cachePhases() { // Parallel slice phases -- note we don't shift in the boundaries/guards for (auto& slice : parallel_slice_phases) { + // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_NOY")) { int ix = i.x(); @@ -166,6 +173,7 @@ Field3D ShiftedMetric::shiftZ(const Field3D& f, const Tensor& phs, Field3D result{emptyFrom(f).setDirectionY(y_direction_out)}; + // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D(toString(region))) { shiftZ(&f(i, 0), &phs(i.x(), i.y(), 0), &result(i, 0)); } @@ -196,7 +204,8 @@ FieldPerp ShiftedMetric::shiftZ(const FieldPerp& f, const Tensor& phs, return result; } -void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* out) const { +void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* out, + int num_batches) const { #if BOUT_HAS_UMPIRE // TODO: This static keyword is a hotfix and should be removed in // future iterations. It is here because otherwise many allocations @@ -208,7 +217,7 @@ void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* ou #endif // Take forward FFT - rfft(in, mesh.LocalNz, &cmplx[0]); + rfft(in, mesh.LocalNz * num_batches, &cmplx[0]); // Following is an algorithm approach to write a = a*b where a and b are // vectors of dcomplex. 
@@ -222,6 +231,267 @@ void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* ou irfft(&cmplx[0], mesh.LocalNz, out); // Reverse FFT } +/* NEW CODE */ +// Bit-reversal +__device__ inline unsigned int bit_reverse(unsigned int x, unsigned int log2n) { + unsigned int result = 0; +#pragma unroll + for (unsigned int i = 0; i < log2n; i++) { + result = (result << 1) | (x & 1); + x >>= 1; + } + return result; +} + +// Block-level cooperative FFT +// Multiple threads cooperate on each FFT using shared memory +template +__global__ void +fft_block_cooperative(const BoutReal** __restrict__ in, BoutReal** __restrict__ out, + const double2** __restrict__ blocks_phs, const int Nz_runtime, + const int nmodes, const int batches, const int nblocks) { + + constexpr int LOG2_NZ = __builtin_ctz(NZ); + constexpr double INV_NZ = 1.0 / (double)NZ; + + // Shared memory for FFTS_PER_BLOCK FFTs + // Each FFT needs NZ complex values + __shared__ double2 shared_fft[FFTS_PER_BLOCK][NZ]; + + // Select twiddles based on size + const double2* twiddles; + if constexpr (NZ == 16) { + twiddles = c_twiddle_fwd_16; + } else if constexpr (NZ == 64) { + twiddles = c_twiddle_fwd_64; + } else if constexpr (NZ == 128) { + twiddles = c_twiddle_fwd_128; + } else if constexpr (NZ == 256) { + twiddles = c_twiddle_fwd_256; + } else if constexpr (NZ == 512) { + twiddles = c_twiddle_fwd_512; + } else { + static_assert(NZ == 16 || NZ == 64 || NZ == 128 || NZ == 256 || NZ == 512, + "Unsupported NZ"); + } + + // Each block processes FFTS_PER_BLOCK FFTs + const int fft_id_in_block = + threadIdx.y; // Which FFT this thread works on (0 to FFTS_PER_BLOCK-1) + const int global_fft_id = blockIdx.x * FFTS_PER_BLOCK + fft_id_in_block; + + if (global_fft_id >= nblocks * batches) + return; + + const int block = global_fft_id / batches; + const int batch = global_fft_id % batches; + + const double* __restrict__ in_line = in[block] + batch * NZ; + double* __restrict__ out_line = out[block] + batch * NZ; + 
const double2* __restrict__ phs = blocks_phs[block]; + + // Thread ID within the FFT computation + const int tid = threadIdx.x; + const int threads_per_fft = blockDim.x; // All threads in x-dimension work on same FFT + + // ===== LOAD INPUT WITH BIT-REVERSAL ===== + // Each thread loads some elements (strided) + for (int i = tid; i < NZ; i += threads_per_fft) { + const unsigned int rev_i = bit_reverse(i, LOG2_NZ); + shared_fft[fft_id_in_block][rev_i].x = in_line[i]; + shared_fft[fft_id_in_block][rev_i].y = 0.0; + } + __syncthreads(); + + // ===== FORWARD FFT: Cooley-Tukey DIT in Shared Memory ===== + for (int stage = 0; stage < LOG2_NZ; ++stage) { + const int m = 1 << (stage + 1); + const int m_half = m >> 1; + + // Each thread processes multiple butterflies + for (int k = tid; k < NZ / 2; k += threads_per_fft) { + const int butterfly_group = k / m_half; + const int j = k % m_half; + const int idx_top = butterfly_group * m + j; + const int idx_bot = idx_top + m_half; + + // Twiddle factor + const int twiddle_k = (j * NZ) / m; + const double wr = twiddles[twiddle_k].x; + const double wi = twiddles[twiddle_k].y; + + // Load from shared memory + const double top_r = shared_fft[fft_id_in_block][idx_top].x; + const double top_i = shared_fft[fft_id_in_block][idx_top].y; + const double bot_r = shared_fft[fft_id_in_block][idx_bot].x; + const double bot_i = shared_fft[fft_id_in_block][idx_bot].y; + + // Butterfly: t = W * bottom + const double t_r = wr * bot_r - wi * bot_i; + const double t_i = wr * bot_i + wi * bot_r; + + // Write back + shared_fft[fft_id_in_block][idx_top].x = top_r + t_r; + shared_fft[fft_id_in_block][idx_top].y = top_i + t_i; + shared_fft[fft_id_in_block][idx_bot].x = top_r - t_r; + shared_fft[fft_id_in_block][idx_bot].y = top_i - t_i; + } + __syncthreads(); + } + + // ===== APPLY PHASE SHIFT ===== + for (int k = tid; k < nmodes; k += threads_per_fft) { + const double2 ph = phs[batch * nmodes + k]; + const double real = shared_fft[fft_id_in_block][k].x; 
+ const double imag = shared_fft[fft_id_in_block][k].y; + shared_fft[fft_id_in_block][k].x = real * ph.x - imag * ph.y; + shared_fft[fft_id_in_block][k].y = real * ph.y + imag * ph.x; + } + + for (int k = tid + nmodes; k < NZ; k += threads_per_fft) { + if (k >= nmodes) { + const int kk = NZ - k; + const double2 tmp = phs[batch * nmodes + kk]; + const double real = shared_fft[fft_id_in_block][k].x; + const double imag = shared_fft[fft_id_in_block][k].y; + shared_fft[fft_id_in_block][k].x = real * tmp.x + imag * tmp.y; + shared_fft[fft_id_in_block][k].y = -real * tmp.y + imag * tmp.x; + } + } + __syncthreads(); + + // ===== INVERSE FFT: Conjugate, FFT, Conjugate ===== + // Conjugate input + for (int i = tid; i < NZ; i += threads_per_fft) { + shared_fft[fft_id_in_block][i].y = -shared_fft[fft_id_in_block][i].y; + } + __syncthreads(); + + // Bit-reverse for inverse + __shared__ double2 temp_fft[FFTS_PER_BLOCK][NZ]; + for (int i = tid; i < NZ; i += threads_per_fft) { + const unsigned int rev_i = bit_reverse(i, LOG2_NZ); + temp_fft[fft_id_in_block][rev_i] = shared_fft[fft_id_in_block][i]; + } + __syncthreads(); + + for (int i = tid; i < NZ; i += threads_per_fft) { + shared_fft[fft_id_in_block][i] = temp_fft[fft_id_in_block][i]; + } + __syncthreads(); + + // Forward FFT again (for inverse) + for (int stage = 0; stage < LOG2_NZ; ++stage) { + const int m = 1 << (stage + 1); + const int m_half = m >> 1; + + for (int k = tid; k < NZ / 2; k += threads_per_fft) { + const int butterfly_group = k / m_half; + const int j = k % m_half; + const int idx_top = butterfly_group * m + j; + const int idx_bot = idx_top + m_half; + + const int twiddle_k = (j * NZ) / m; + const double wr = twiddles[twiddle_k].x; + const double wi = twiddles[twiddle_k].y; + + const double top_r = shared_fft[fft_id_in_block][idx_top].x; + const double top_i = shared_fft[fft_id_in_block][idx_top].y; + const double bot_r = shared_fft[fft_id_in_block][idx_bot].x; + const double bot_i = 
shared_fft[fft_id_in_block][idx_bot].y; + + const double t_r = wr * bot_r - wi * bot_i; + const double t_i = wr * bot_i + wi * bot_r; + + shared_fft[fft_id_in_block][idx_top].x = top_r + t_r; + shared_fft[fft_id_in_block][idx_top].y = top_i + t_i; + shared_fft[fft_id_in_block][idx_bot].x = top_r - t_r; + shared_fft[fft_id_in_block][idx_bot].y = top_i - t_i; + } + __syncthreads(); + } + + // Store output (conjugate and normalize) + for (int i = tid; i < NZ; i += threads_per_fft) { + out_line[i] = shared_fft[fft_id_in_block][i].x * INV_NZ; + } +} + +// Launcher for block-level cooperative FFT +static void shiftZ_block_fft(Mesh& mesh, const BoutReal** in, BoutReal** out, + const double2** phs, int nblocks, int batches, + cudaStream_t stream = 0) { + int Nz = mesh.LocalNz; + int nmodes = Nz / 2 + 1; + + if ((Nz & (Nz - 1)) != 0) { + fprintf(stderr, "Error: Nz=%d must be power of 2\n", Nz); + return; + } + + const int total_ffts = nblocks * batches; + + if (Nz == 16) { + constexpr int FFTS_PER_BLOCK = 16; + constexpr int THREADS_PER_FFT = 16; // Use 64 threads per FFT + + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 16 x 16 = 256 threads + dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); + + fft_block_cooperative<16, FFTS_PER_BLOCK> + <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + } else if (Nz == 64) { + constexpr int FFTS_PER_BLOCK = 4; + constexpr int THREADS_PER_FFT = 64; // Use 64 threads per FFT + + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 64 x 4 = 256 threads + dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); + + fft_block_cooperative<64, FFTS_PER_BLOCK> + <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + + } else if (Nz == 128) { + constexpr int FFTS_PER_BLOCK = 2; + constexpr int THREADS_PER_FFT = 128; + + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 128 x 2 = 256 threads + dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); + + fft_block_cooperative<128, FFTS_PER_BLOCK> + <<>>(in, out, phs, Nz, nmodes, 
batches, nblocks); + + } else if (Nz == 256) { + constexpr int FFTS_PER_BLOCK = 1; + constexpr int THREADS_PER_FFT = 256; + + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 256 x 1 = 256 threads + dim3 grid(total_ffts); + + fft_block_cooperative<256, FFTS_PER_BLOCK> + <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + + } else if (Nz == 512) { + constexpr int FFTS_PER_BLOCK = 1; + constexpr int THREADS_PER_FFT = 512; // 512 threads per FFT + + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 512 x 1 = 512 threads + dim3 grid(total_ffts); + + fft_block_cooperative<512, FFTS_PER_BLOCK> + <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + } else { + fprintf(stderr, "Unsupported Nz=%d for block FFT\n", Nz); + throw std::runtime_error("Unsupported Nz for block FFT"); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error(std::string("Block FFT failed: ") + cudaGetErrorString(err)); + } +} + +/* END NEWER CODE */ + void ShiftedMetric::calcParallelSlices(Field3D& f) { if (f.getDirectionY() == YDirectionType::Aligned) { // Cannot calculate parallel slices for field-aligned fields, so return without @@ -231,9 +501,51 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { f.splitParallelSlices(); + auto& region = mesh.getRegion2D("RGN_NOY"); + + static size_t nblocks = region.getBlocks().size(); + if (nblocks != region.getBlocks().size()) { + throw BoutException("Number of blocks changed in ShiftedMetric::calcParallelSlices"); + } + static Array blocks_in(nblocks); + static Array blocks_out(nblocks); + static Array phs_in(nblocks); + for (const auto& phase : parallel_slice_phases) { auto& f_slice = f.ynext(phase.y_offset); f_slice.allocate(); + +#if BOUT_HAS_CUDA + size_t block_idx = 0; + int num_batches = + region.getBlocks().cbegin()->second.ind - region.getBlocks().cbegin()->first.ind; + + for (auto block = region.getBlocks().cbegin(), end = region.getBlocks().cend(); + block < end; ++block) { + auto idx_s = block->first; + 
auto idx_e = block->second; + int inner_batches = idx_e.ind - idx_s.ind; + if (inner_batches != num_batches) { + throw BoutException( + "Non-uniform number of batches in ShiftedMetric::calcParallelSlices"); + } + const int ix = idx_s.x(); + const int iy = idx_s.y(); + const int iy_offset = iy + phase.y_offset; + + blocks_in[block_idx] = &f(ix, iy_offset, 0); + blocks_out[block_idx] = &f_slice(ix, iy_offset, 0); + phs_in[block_idx] = reinterpret_cast(&phase.phase_shift(ix, iy, 0)); + + block_idx++; + } + + shiftZ_block_fft(mesh, &blocks_in[0], &blocks_out[0], &phs_in[0], nblocks, + num_batches, 0); + + cudaDeviceSynchronize(); +#else + // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_NOY")) { const int ix = i.x(); const int iy = i.y(); @@ -241,6 +553,9 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { shiftZ(&(f(ix, iy_offset, 0)), &(phase.phase_shift(ix, iy, 0)), &(f_slice(ix, iy_offset, 0))); } + //std::cout << "ShiftedMetric::shiftZ " << __FILE__ << " :" << __LINE__ + // << " count = " << count << " each size " << mesh.LocalNz << "\n"; +#endif } } @@ -257,6 +572,7 @@ ShiftedMetric::shiftZ(const Field3D& f, Matrix> f_fft(mesh.LocalNx, mesh.LocalNy); f_fft = Array(nmodes); + // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_ALL")) { int ix = i.x(); int iy = i.y(); @@ -271,6 +587,7 @@ ShiftedMetric::shiftZ(const Field3D& f, current_result.allocate(); current_result.setLocation(f.getLocation()); + // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_NOY")) { // Deep copy the FFT'd field int ix = i.x(); From 40974a9d98859690646e43f9a8e4391a5d1e585e Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 2025 08:33:29 -0700 Subject: [PATCH 21/58] Fixup: add twiddle header in cmake --- CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/CMakeLists.txt b/CMakeLists.txt index f0a657fe94..b5beb75898 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -177,6 +177,7 @@ set(BOUT_SOURCES ./include/bout/sys/range.hxx ./include/bout/sys/timer.hxx ./include/bout/sys/type_name.hxx + ./include/bout/twiddle.hxx ./include/bout/sys/uncopyable.hxx ./include/bout/sys/uuid.h ./include/bout/sys/variant.hxx @@ -239,7 +240,7 @@ set(BOUT_SOURCES ./include/bout/invert/laplacexy2.hxx ./src/invert/laplacexy2/laplacexy2.cxx ./include/bout/invert/laplacexy2_hypre.hxx - ./src/invert/laplacexy2/laplacexy2_hypre.cxx + ./src/invert/laplacexy2/laplacexy2_hypre.cxx ./src/invert/laplacexz/impls/cyclic/laplacexz-cyclic.cxx ./src/invert/laplacexz/impls/cyclic/laplacexz-cyclic.hxx ./src/invert/laplacexz/impls/petsc/laplacexz-petsc.cxx @@ -525,7 +526,7 @@ if (BOUT_ENABLE_WARNINGS) $<$,$,$>: -Wall -Wextra > > $<$: - /W4 > + /W4 > $<$:-Xcompiler=-Wall -Xcompiler=-Wextra > ) From ba5eabf34242f8ade482cd7b8486248a44e1f73d Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 2025 08:33:55 -0700 Subject: [PATCH 22/58] Default to pinned memory for performance --- include/bout/array.hxx | 1 + 1 file changed, 1 insertion(+) diff --git a/include/bout/array.hxx b/include/bout/array.hxx index 2c42f15aad..4965aee880 100644 --- a/include/bout/array.hxx +++ b/include/bout/array.hxx @@ -67,6 +67,7 @@ struct ArrayData { auto& rm = umpire::ResourceManager::getInstance(); #if BOUT_HAS_CUDA auto allocator = rm.getAllocator(umpire::resource::Pinned); + //auto allocator = rm.getAllocator(umpire::resource::Unified); #else auto allocator = rm.getAllocator("HOST"); #endif From c079bb6f508efa85b8b73dcf0f5ee9ac5ce3b231 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 2025 09:40:56 -0700 Subject: [PATCH 23/58] Fixup: remove unused twiddles --- include/bout/twiddle.hxx | 1033 +------------------------------------- 1 file changed, 6 insertions(+), 1027 deletions(-) diff --git a/include/bout/twiddle.hxx 
b/include/bout/twiddle.hxx index ae4f729b48..6da72dd8ff 100644 --- a/include/bout/twiddle.hxx +++ b/include/bout/twiddle.hxx @@ -1,4 +1,4 @@ -__constant__ double2 c_twiddle_fwd_16[16] = { +__constant__ double2 c_twiddle_16[16] = { {1.0000000000000000, -0.0000000000000000}, // k=0 {0.9238795325112867, -0.3826834323650898}, // k=1 {0.7071067811865476, -0.7071067811865475}, // k=2 @@ -17,25 +17,7 @@ __constant__ double2 c_twiddle_fwd_16[16] = { {0.9238795325112865, 0.3826834323650904}, // k=15 }; -__constant__ double2 c_twiddle_inv_16[16] = { - {1.0000000000000000, 0.0000000000000000}, // k=0 - {0.9238795325112867, 0.3826834323650898}, // k=1 - {0.7071067811865476, 0.7071067811865475}, // k=2 - {0.3826834323650898, 0.9238795325112867}, // k=3 - {0.0000000000000001, 1.0000000000000000}, // k=4 - {-0.3826834323650897, 0.9238795325112867}, // k=5 - {-0.7071067811865475, 0.7071067811865476}, // k=6 - {-0.9238795325112867, 0.3826834323650899}, // k=7 - {-1.0000000000000000, 0.0000000000000001}, // k=8 - {-0.9238795325112868, -0.3826834323650897}, // k=9 - {-0.7071067811865477, -0.7071067811865475}, // k=10 - {-0.3826834323650903, -0.9238795325112865}, // k=11 - {-0.0000000000000002, -1.0000000000000000}, // k=12 - {0.3826834323650900, -0.9238795325112866}, // k=13 - {0.7071067811865474, -0.7071067811865477}, // k=14 - {0.9238795325112865, -0.3826834323650904}, // k=15 -}; -__constant__ double2 c_twiddle_fwd_32[32] = { +__constant__ double2 c_twiddle_32[32] = { {1.0000000000000000, -0.0000000000000000}, // k=0 {0.9807852804032304, -0.1950903220161282}, // k=1 {0.9238795325112867, -0.3826834323650898}, // k=2 @@ -70,41 +52,7 @@ __constant__ double2 c_twiddle_fwd_32[32] = { {0.9807852804032303, 0.1950903220161287}, // k=31 }; -__constant__ double2 c_twiddle_inv_32[32] = { - {1.0000000000000000, 0.0000000000000000}, // k=0 - {0.9807852804032304, 0.1950903220161282}, // k=1 - {0.9238795325112867, 0.3826834323650898}, // k=2 - {0.8314696123025452, 0.5555702330196022}, // k=3 - 
{0.7071067811865476, 0.7071067811865475}, // k=4 - {0.5555702330196023, 0.8314696123025452}, // k=5 - {0.3826834323650898, 0.9238795325112867}, // k=6 - {0.1950903220161283, 0.9807852804032304}, // k=7 - {0.0000000000000001, 1.0000000000000000}, // k=8 - {-0.1950903220161282, 0.9807852804032304}, // k=9 - {-0.3826834323650897, 0.9238795325112867}, // k=10 - {-0.5555702330196020, 0.8314696123025455}, // k=11 - {-0.7071067811865475, 0.7071067811865476}, // k=12 - {-0.8314696123025453, 0.5555702330196022}, // k=13 - {-0.9238795325112867, 0.3826834323650899}, // k=14 - {-0.9807852804032304, 0.1950903220161286}, // k=15 - {-1.0000000000000000, 0.0000000000000001}, // k=16 - {-0.9807852804032304, -0.1950903220161284}, // k=17 - {-0.9238795325112868, -0.3826834323650897}, // k=18 - {-0.8314696123025455, -0.5555702330196020}, // k=19 - {-0.7071067811865477, -0.7071067811865475}, // k=20 - {-0.5555702330196022, -0.8314696123025452}, // k=21 - {-0.3826834323650903, -0.9238795325112865}, // k=22 - {-0.1950903220161287, -0.9807852804032303}, // k=23 - {-0.0000000000000002, -1.0000000000000000}, // k=24 - {0.1950903220161283, -0.9807852804032304}, // k=25 - {0.3826834323650900, -0.9238795325112866}, // k=26 - {0.5555702330196018, -0.8314696123025455}, // k=27 - {0.7071067811865474, -0.7071067811865477}, // k=28 - {0.8314696123025452, -0.5555702330196022}, // k=29 - {0.9238795325112865, -0.3826834323650904}, // k=30 - {0.9807852804032303, -0.1950903220161287}, // k=31 -}; -__constant__ double2 c_twiddle_fwd_64[64] = { +__constant__ double2 c_twiddle_64[64] = { {1.0000000000000000, -0.0000000000000000}, // k=0 {0.9951847266721969, -0.0980171403295606}, // k=1 {0.9807852804032304, -0.1950903220161282}, // k=2 @@ -171,73 +119,7 @@ __constant__ double2 c_twiddle_fwd_64[64] = { {0.9951847266721969, 0.0980171403295605}, // k=63 }; -__constant__ double2 c_twiddle_inv_64[64] = { - {1.0000000000000000, 0.0000000000000000}, // k=0 - {0.9951847266721969, 0.0980171403295606}, // k=1 - 
{0.9807852804032304, 0.1950903220161282}, // k=2 - {0.9569403357322088, 0.2902846772544623}, // k=3 - {0.9238795325112867, 0.3826834323650898}, // k=4 - {0.8819212643483550, 0.4713967368259976}, // k=5 - {0.8314696123025452, 0.5555702330196022}, // k=6 - {0.7730104533627370, 0.6343932841636455}, // k=7 - {0.7071067811865476, 0.7071067811865475}, // k=8 - {0.6343932841636455, 0.7730104533627370}, // k=9 - {0.5555702330196023, 0.8314696123025452}, // k=10 - {0.4713967368259978, 0.8819212643483549}, // k=11 - {0.3826834323650898, 0.9238795325112867}, // k=12 - {0.2902846772544623, 0.9569403357322089}, // k=13 - {0.1950903220161283, 0.9807852804032304}, // k=14 - {0.0980171403295608, 0.9951847266721968}, // k=15 - {0.0000000000000001, 1.0000000000000000}, // k=16 - {-0.0980171403295606, 0.9951847266721969}, // k=17 - {-0.1950903220161282, 0.9807852804032304}, // k=18 - {-0.2902846772544622, 0.9569403357322089}, // k=19 - {-0.3826834323650897, 0.9238795325112867}, // k=20 - {-0.4713967368259977, 0.8819212643483550}, // k=21 - {-0.5555702330196020, 0.8314696123025455}, // k=22 - {-0.6343932841636454, 0.7730104533627371}, // k=23 - {-0.7071067811865475, 0.7071067811865476}, // k=24 - {-0.7730104533627370, 0.6343932841636455}, // k=25 - {-0.8314696123025453, 0.5555702330196022}, // k=26 - {-0.8819212643483549, 0.4713967368259979}, // k=27 - {-0.9238795325112867, 0.3826834323650899}, // k=28 - {-0.9569403357322088, 0.2902846772544624}, // k=29 - {-0.9807852804032304, 0.1950903220161286}, // k=30 - {-0.9951847266721968, 0.0980171403295608}, // k=31 - {-1.0000000000000000, 0.0000000000000001}, // k=32 - {-0.9951847266721969, -0.0980171403295606}, // k=33 - {-0.9807852804032304, -0.1950903220161284}, // k=34 - {-0.9569403357322089, -0.2902846772544621}, // k=35 - {-0.9238795325112868, -0.3826834323650897}, // k=36 - {-0.8819212643483550, -0.4713967368259976}, // k=37 - {-0.8314696123025455, -0.5555702330196020}, // k=38 - {-0.7730104533627371, -0.6343932841636453}, // k=39 - 
{-0.7071067811865477, -0.7071067811865475}, // k=40 - {-0.6343932841636459, -0.7730104533627367}, // k=41 - {-0.5555702330196022, -0.8314696123025452}, // k=42 - {-0.4713967368259979, -0.8819212643483549}, // k=43 - {-0.3826834323650903, -0.9238795325112865}, // k=44 - {-0.2902846772544624, -0.9569403357322088}, // k=45 - {-0.1950903220161287, -0.9807852804032303}, // k=46 - {-0.0980171403295605, -0.9951847266721969}, // k=47 - {-0.0000000000000002, -1.0000000000000000}, // k=48 - {0.0980171403295601, -0.9951847266721969}, // k=49 - {0.1950903220161283, -0.9807852804032304}, // k=50 - {0.2902846772544621, -0.9569403357322089}, // k=51 - {0.3826834323650900, -0.9238795325112866}, // k=52 - {0.4713967368259976, -0.8819212643483550}, // k=53 - {0.5555702330196018, -0.8314696123025455}, // k=54 - {0.6343932841636456, -0.7730104533627369}, // k=55 - {0.7071067811865474, -0.7071067811865477}, // k=56 - {0.7730104533627367, -0.6343932841636459}, // k=57 - {0.8314696123025452, -0.5555702330196022}, // k=58 - {0.8819212643483548, -0.4713967368259979}, // k=59 - {0.9238795325112865, -0.3826834323650904}, // k=60 - {0.9569403357322088, -0.2902846772544625}, // k=61 - {0.9807852804032303, -0.1950903220161287}, // k=62 - {0.9951847266721969, -0.0980171403295605}, // k=63 -}; -__constant__ double2 c_twiddle_fwd_128[128] = { +__constant__ double2 c_twiddle_128[128] = { {1.0000000000000000, -0.0000000000000000}, // k=0 {0.9987954562051724, -0.0490676743274180}, // k=1 {0.9951847266721969, -0.0980171403295606}, // k=2 @@ -368,137 +250,7 @@ __constant__ double2 c_twiddle_fwd_128[128] = { {0.9987954562051724, 0.0490676743274181}, // k=127 }; -__constant__ double2 c_twiddle_inv_128[128] = { - {1.0000000000000000, 0.0000000000000000}, // k=0 - {0.9987954562051724, 0.0490676743274180}, // k=1 - {0.9951847266721969, 0.0980171403295606}, // k=2 - {0.9891765099647810, 0.1467304744553617}, // k=3 - {0.9807852804032304, 0.1950903220161282}, // k=4 - {0.9700312531945440, 0.2429801799032639}, 
// k=5 - {0.9569403357322088, 0.2902846772544623}, // k=6 - {0.9415440651830208, 0.3368898533922201}, // k=7 - {0.9238795325112867, 0.3826834323650898}, // k=8 - {0.9039892931234433, 0.4275550934302821}, // k=9 - {0.8819212643483550, 0.4713967368259976}, // k=10 - {0.8577286100002721, 0.5141027441932217}, // k=11 - {0.8314696123025452, 0.5555702330196022}, // k=12 - {0.8032075314806449, 0.5956993044924334}, // k=13 - {0.7730104533627370, 0.6343932841636455}, // k=14 - {0.7409511253549591, 0.6715589548470183}, // k=15 - {0.7071067811865476, 0.7071067811865475}, // k=16 - {0.6715589548470183, 0.7409511253549591}, // k=17 - {0.6343932841636455, 0.7730104533627370}, // k=18 - {0.5956993044924335, 0.8032075314806448}, // k=19 - {0.5555702330196023, 0.8314696123025452}, // k=20 - {0.5141027441932217, 0.8577286100002721}, // k=21 - {0.4713967368259978, 0.8819212643483549}, // k=22 - {0.4275550934302822, 0.9039892931234433}, // k=23 - {0.3826834323650898, 0.9238795325112867}, // k=24 - {0.3368898533922201, 0.9415440651830208}, // k=25 - {0.2902846772544623, 0.9569403357322089}, // k=26 - {0.2429801799032640, 0.9700312531945440}, // k=27 - {0.1950903220161283, 0.9807852804032304}, // k=28 - {0.1467304744553617, 0.9891765099647810}, // k=29 - {0.0980171403295608, 0.9951847266721968}, // k=30 - {0.0490676743274181, 0.9987954562051724}, // k=31 - {0.0000000000000001, 1.0000000000000000}, // k=32 - {-0.0490676743274180, 0.9987954562051724}, // k=33 - {-0.0980171403295606, 0.9951847266721969}, // k=34 - {-0.1467304744553616, 0.9891765099647810}, // k=35 - {-0.1950903220161282, 0.9807852804032304}, // k=36 - {-0.2429801799032639, 0.9700312531945440}, // k=37 - {-0.2902846772544622, 0.9569403357322089}, // k=38 - {-0.3368898533922199, 0.9415440651830208}, // k=39 - {-0.3826834323650897, 0.9238795325112867}, // k=40 - {-0.4275550934302819, 0.9039892931234434}, // k=41 - {-0.4713967368259977, 0.8819212643483550}, // k=42 - {-0.5141027441932217, 0.8577286100002721}, // k=43 - 
{-0.5555702330196020, 0.8314696123025455}, // k=44 - {-0.5956993044924334, 0.8032075314806449}, // k=45 - {-0.6343932841636454, 0.7730104533627371}, // k=46 - {-0.6715589548470184, 0.7409511253549590}, // k=47 - {-0.7071067811865475, 0.7071067811865476}, // k=48 - {-0.7409511253549589, 0.6715589548470186}, // k=49 - {-0.7730104533627370, 0.6343932841636455}, // k=50 - {-0.8032075314806448, 0.5956993044924335}, // k=51 - {-0.8314696123025453, 0.5555702330196022}, // k=52 - {-0.8577286100002720, 0.5141027441932218}, // k=53 - {-0.8819212643483549, 0.4713967368259979}, // k=54 - {-0.9039892931234433, 0.4275550934302820}, // k=55 - {-0.9238795325112867, 0.3826834323650899}, // k=56 - {-0.9415440651830207, 0.3368898533922203}, // k=57 - {-0.9569403357322088, 0.2902846772544624}, // k=58 - {-0.9700312531945440, 0.2429801799032641}, // k=59 - {-0.9807852804032304, 0.1950903220161286}, // k=60 - {-0.9891765099647810, 0.1467304744553618}, // k=61 - {-0.9951847266721968, 0.0980171403295608}, // k=62 - {-0.9987954562051724, 0.0490676743274180}, // k=63 - {-1.0000000000000000, 0.0000000000000001}, // k=64 - {-0.9987954562051724, -0.0490676743274177}, // k=65 - {-0.9951847266721969, -0.0980171403295606}, // k=66 - {-0.9891765099647810, -0.1467304744553616}, // k=67 - {-0.9807852804032304, -0.1950903220161284}, // k=68 - {-0.9700312531945440, -0.2429801799032638}, // k=69 - {-0.9569403357322089, -0.2902846772544621}, // k=70 - {-0.9415440651830208, -0.3368898533922201}, // k=71 - {-0.9238795325112868, -0.3826834323650897}, // k=72 - {-0.9039892931234434, -0.4275550934302818}, // k=73 - {-0.8819212643483550, -0.4713967368259976}, // k=74 - {-0.8577286100002721, -0.5141027441932216}, // k=75 - {-0.8314696123025455, -0.5555702330196020}, // k=76 - {-0.8032075314806449, -0.5956993044924332}, // k=77 - {-0.7730104533627371, -0.6343932841636453}, // k=78 - {-0.7409511253549591, -0.6715589548470184}, // k=79 - {-0.7071067811865477, -0.7071067811865475}, // k=80 - {-0.6715589548470187, 
-0.7409511253549589}, // k=81 - {-0.6343932841636459, -0.7730104533627367}, // k=82 - {-0.5956993044924331, -0.8032075314806451}, // k=83 - {-0.5555702330196022, -0.8314696123025452}, // k=84 - {-0.5141027441932218, -0.8577286100002720}, // k=85 - {-0.4713967368259979, -0.8819212643483549}, // k=86 - {-0.4275550934302825, -0.9039892931234431}, // k=87 - {-0.3826834323650903, -0.9238795325112865}, // k=88 - {-0.3368898533922199, -0.9415440651830208}, // k=89 - {-0.2902846772544624, -0.9569403357322088}, // k=90 - {-0.2429801799032641, -0.9700312531945440}, // k=91 - {-0.1950903220161287, -0.9807852804032303}, // k=92 - {-0.1467304744553623, -0.9891765099647809}, // k=93 - {-0.0980171403295605, -0.9951847266721969}, // k=94 - {-0.0490676743274180, -0.9987954562051724}, // k=95 - {-0.0000000000000002, -1.0000000000000000}, // k=96 - {0.0490676743274177, -0.9987954562051724}, // k=97 - {0.0980171403295601, -0.9951847266721969}, // k=98 - {0.1467304744553619, -0.9891765099647809}, // k=99 - {0.1950903220161283, -0.9807852804032304}, // k=100 - {0.2429801799032638, -0.9700312531945440}, // k=101 - {0.2902846772544621, -0.9569403357322089}, // k=102 - {0.3368898533922196, -0.9415440651830209}, // k=103 - {0.3826834323650900, -0.9238795325112866}, // k=104 - {0.4275550934302821, -0.9039892931234433}, // k=105 - {0.4713967368259976, -0.8819212643483550}, // k=106 - {0.5141027441932216, -0.8577286100002722}, // k=107 - {0.5555702330196018, -0.8314696123025455}, // k=108 - {0.5956993044924329, -0.8032075314806453}, // k=109 - {0.6343932841636456, -0.7730104533627369}, // k=110 - {0.6715589548470183, -0.7409511253549591}, // k=111 - {0.7071067811865474, -0.7071067811865477}, // k=112 - {0.7409511253549589, -0.6715589548470187}, // k=113 - {0.7730104533627367, -0.6343932841636459}, // k=114 - {0.8032075314806451, -0.5956993044924332}, // k=115 - {0.8314696123025452, -0.5555702330196022}, // k=116 - {0.8577286100002720, -0.5141027441932219}, // k=117 - {0.8819212643483548, 
-0.4713967368259979}, // k=118 - {0.9039892931234431, -0.4275550934302825}, // k=119 - {0.9238795325112865, -0.3826834323650904}, // k=120 - {0.9415440651830208, -0.3368898533922200}, // k=121 - {0.9569403357322088, -0.2902846772544625}, // k=122 - {0.9700312531945440, -0.2429801799032642}, // k=123 - {0.9807852804032303, -0.1950903220161287}, // k=124 - {0.9891765099647809, -0.1467304744553624}, // k=125 - {0.9951847266721969, -0.0980171403295605}, // k=126 - {0.9987954562051724, -0.0490676743274181}, // k=127 -}; -__constant__ double2 c_twiddle_fwd_256[256] = { +__constant__ double2 c_twiddle_256[256] = { {1.0000000000000000, -0.0000000000000000}, // k=0 {0.9996988186962042, -0.0245412285229123}, // k=1 {0.9987954562051724, -0.0490676743274180}, // k=2 @@ -757,265 +509,7 @@ __constant__ double2 c_twiddle_fwd_256[256] = { {0.9996988186962042, 0.0245412285229124}, // k=255 }; -__constant__ double2 c_twiddle_inv_256[256] = { - {1.0000000000000000, 0.0000000000000000}, // k=0 - {0.9996988186962042, 0.0245412285229123}, // k=1 - {0.9987954562051724, 0.0490676743274180}, // k=2 - {0.9972904566786902, 0.0735645635996674}, // k=3 - {0.9951847266721969, 0.0980171403295606}, // k=4 - {0.9924795345987100, 0.1224106751992162}, // k=5 - {0.9891765099647810, 0.1467304744553617}, // k=6 - {0.9852776423889412, 0.1709618887603012}, // k=7 - {0.9807852804032304, 0.1950903220161282}, // k=8 - {0.9757021300385286, 0.2191012401568698}, // k=9 - {0.9700312531945440, 0.2429801799032639}, // k=10 - {0.9637760657954398, 0.2667127574748984}, // k=11 - {0.9569403357322088, 0.2902846772544623}, // k=12 - {0.9495281805930367, 0.3136817403988915}, // k=13 - {0.9415440651830208, 0.3368898533922201}, // k=14 - {0.9329927988347390, 0.3598950365349881}, // k=15 - {0.9238795325112867, 0.3826834323650898}, // k=16 - {0.9142097557035307, 0.4052413140049899}, // k=17 - {0.9039892931234433, 0.4275550934302821}, // k=18 - {0.8932243011955153, 0.4496113296546065}, // k=19 - {0.8819212643483550, 
0.4713967368259976}, // k=20 - {0.8700869911087115, 0.4928981922297840}, // k=21 - {0.8577286100002721, 0.5141027441932217}, // k=22 - {0.8448535652497071, 0.5349976198870972}, // k=23 - {0.8314696123025452, 0.5555702330196022}, // k=24 - {0.8175848131515837, 0.5758081914178453}, // k=25 - {0.8032075314806449, 0.5956993044924334}, // k=26 - {0.7883464276266063, 0.6152315905806268}, // k=27 - {0.7730104533627370, 0.6343932841636455}, // k=28 - {0.7572088465064846, 0.6531728429537768}, // k=29 - {0.7409511253549591, 0.6715589548470183}, // k=30 - {0.7242470829514670, 0.6895405447370668}, // k=31 - {0.7071067811865476, 0.7071067811865475}, // k=32 - {0.6895405447370669, 0.7242470829514669}, // k=33 - {0.6715589548470183, 0.7409511253549591}, // k=34 - {0.6531728429537768, 0.7572088465064845}, // k=35 - {0.6343932841636455, 0.7730104533627370}, // k=36 - {0.6152315905806268, 0.7883464276266062}, // k=37 - {0.5956993044924335, 0.8032075314806448}, // k=38 - {0.5758081914178453, 0.8175848131515837}, // k=39 - {0.5555702330196023, 0.8314696123025452}, // k=40 - {0.5349976198870973, 0.8448535652497070}, // k=41 - {0.5141027441932217, 0.8577286100002721}, // k=42 - {0.4928981922297841, 0.8700869911087113}, // k=43 - {0.4713967368259978, 0.8819212643483549}, // k=44 - {0.4496113296546066, 0.8932243011955153}, // k=45 - {0.4275550934302822, 0.9039892931234433}, // k=46 - {0.4052413140049899, 0.9142097557035307}, // k=47 - {0.3826834323650898, 0.9238795325112867}, // k=48 - {0.3598950365349883, 0.9329927988347388}, // k=49 - {0.3368898533922201, 0.9415440651830208}, // k=50 - {0.3136817403988916, 0.9495281805930367}, // k=51 - {0.2902846772544623, 0.9569403357322089}, // k=52 - {0.2667127574748984, 0.9637760657954398}, // k=53 - {0.2429801799032640, 0.9700312531945440}, // k=54 - {0.2191012401568698, 0.9757021300385286}, // k=55 - {0.1950903220161283, 0.9807852804032304}, // k=56 - {0.1709618887603014, 0.9852776423889412}, // k=57 - {0.1467304744553617, 0.9891765099647810}, // 
k=58 - {0.1224106751992163, 0.9924795345987100}, // k=59 - {0.0980171403295608, 0.9951847266721968}, // k=60 - {0.0735645635996675, 0.9972904566786902}, // k=61 - {0.0490676743274181, 0.9987954562051724}, // k=62 - {0.0245412285229123, 0.9996988186962042}, // k=63 - {0.0000000000000001, 1.0000000000000000}, // k=64 - {-0.0245412285229121, 0.9996988186962042}, // k=65 - {-0.0490676743274180, 0.9987954562051724}, // k=66 - {-0.0735645635996673, 0.9972904566786902}, // k=67 - {-0.0980171403295606, 0.9951847266721969}, // k=68 - {-0.1224106751992162, 0.9924795345987100}, // k=69 - {-0.1467304744553616, 0.9891765099647810}, // k=70 - {-0.1709618887603012, 0.9852776423889412}, // k=71 - {-0.1950903220161282, 0.9807852804032304}, // k=72 - {-0.2191012401568697, 0.9757021300385286}, // k=73 - {-0.2429801799032639, 0.9700312531945440}, // k=74 - {-0.2667127574748983, 0.9637760657954398}, // k=75 - {-0.2902846772544622, 0.9569403357322089}, // k=76 - {-0.3136817403988914, 0.9495281805930367}, // k=77 - {-0.3368898533922199, 0.9415440651830208}, // k=78 - {-0.3598950365349882, 0.9329927988347388}, // k=79 - {-0.3826834323650897, 0.9238795325112867}, // k=80 - {-0.4052413140049897, 0.9142097557035307}, // k=81 - {-0.4275550934302819, 0.9039892931234434}, // k=82 - {-0.4496113296546067, 0.8932243011955152}, // k=83 - {-0.4713967368259977, 0.8819212643483550}, // k=84 - {-0.4928981922297840, 0.8700869911087115}, // k=85 - {-0.5141027441932217, 0.8577286100002721}, // k=86 - {-0.5349976198870970, 0.8448535652497072}, // k=87 - {-0.5555702330196020, 0.8314696123025455}, // k=88 - {-0.5758081914178453, 0.8175848131515837}, // k=89 - {-0.5956993044924334, 0.8032075314806449}, // k=90 - {-0.6152315905806267, 0.7883464276266063}, // k=91 - {-0.6343932841636454, 0.7730104533627371}, // k=92 - {-0.6531728429537765, 0.7572088465064847}, // k=93 - {-0.6715589548470184, 0.7409511253549590}, // k=94 - {-0.6895405447370669, 0.7242470829514669}, // k=95 - {-0.7071067811865475, 
0.7071067811865476}, // k=96 - {-0.7242470829514668, 0.6895405447370671}, // k=97 - {-0.7409511253549589, 0.6715589548470186}, // k=98 - {-0.7572088465064846, 0.6531728429537766}, // k=99 - {-0.7730104533627370, 0.6343932841636455}, // k=100 - {-0.7883464276266062, 0.6152315905806269}, // k=101 - {-0.8032075314806448, 0.5956993044924335}, // k=102 - {-0.8175848131515836, 0.5758081914178454}, // k=103 - {-0.8314696123025453, 0.5555702330196022}, // k=104 - {-0.8448535652497071, 0.5349976198870972}, // k=105 - {-0.8577286100002720, 0.5141027441932218}, // k=106 - {-0.8700869911087113, 0.4928981922297841}, // k=107 - {-0.8819212643483549, 0.4713967368259979}, // k=108 - {-0.8932243011955152, 0.4496113296546069}, // k=109 - {-0.9039892931234433, 0.4275550934302820}, // k=110 - {-0.9142097557035307, 0.4052413140049899}, // k=111 - {-0.9238795325112867, 0.3826834323650899}, // k=112 - {-0.9329927988347388, 0.3598950365349883}, // k=113 - {-0.9415440651830207, 0.3368898533922203}, // k=114 - {-0.9495281805930367, 0.3136817403988914}, // k=115 - {-0.9569403357322088, 0.2902846772544624}, // k=116 - {-0.9637760657954398, 0.2667127574748985}, // k=117 - {-0.9700312531945440, 0.2429801799032641}, // k=118 - {-0.9757021300385285, 0.2191012401568700}, // k=119 - {-0.9807852804032304, 0.1950903220161286}, // k=120 - {-0.9852776423889412, 0.1709618887603012}, // k=121 - {-0.9891765099647810, 0.1467304744553618}, // k=122 - {-0.9924795345987100, 0.1224106751992163}, // k=123 - {-0.9951847266721968, 0.0980171403295608}, // k=124 - {-0.9972904566786902, 0.0735645635996677}, // k=125 - {-0.9987954562051724, 0.0490676743274180}, // k=126 - {-0.9996988186962042, 0.0245412285229123}, // k=127 - {-1.0000000000000000, 0.0000000000000001}, // k=128 - {-0.9996988186962042, -0.0245412285229121}, // k=129 - {-0.9987954562051724, -0.0490676743274177}, // k=130 - {-0.9972904566786902, -0.0735645635996675}, // k=131 - {-0.9951847266721969, -0.0980171403295606}, // k=132 - {-0.9924795345987100, 
-0.1224106751992161}, // k=133 - {-0.9891765099647810, -0.1467304744553616}, // k=134 - {-0.9852776423889413, -0.1709618887603010}, // k=135 - {-0.9807852804032304, -0.1950903220161284}, // k=136 - {-0.9757021300385286, -0.2191012401568698}, // k=137 - {-0.9700312531945440, -0.2429801799032638}, // k=138 - {-0.9637760657954400, -0.2667127574748983}, // k=139 - {-0.9569403357322089, -0.2902846772544621}, // k=140 - {-0.9495281805930368, -0.3136817403988912}, // k=141 - {-0.9415440651830208, -0.3368898533922201}, // k=142 - {-0.9329927988347390, -0.3598950365349881}, // k=143 - {-0.9238795325112868, -0.3826834323650897}, // k=144 - {-0.9142097557035307, -0.4052413140049897}, // k=145 - {-0.9039892931234434, -0.4275550934302818}, // k=146 - {-0.8932243011955153, -0.4496113296546067}, // k=147 - {-0.8819212643483550, -0.4713967368259976}, // k=148 - {-0.8700869911087115, -0.4928981922297839}, // k=149 - {-0.8577286100002721, -0.5141027441932216}, // k=150 - {-0.8448535652497072, -0.5349976198870969}, // k=151 - {-0.8314696123025455, -0.5555702330196020}, // k=152 - {-0.8175848131515837, -0.5758081914178453}, // k=153 - {-0.8032075314806449, -0.5956993044924332}, // k=154 - {-0.7883464276266063, -0.6152315905806267}, // k=155 - {-0.7730104533627371, -0.6343932841636453}, // k=156 - {-0.7572088465064848, -0.6531728429537765}, // k=157 - {-0.7409511253549591, -0.6715589548470184}, // k=158 - {-0.7242470829514670, -0.6895405447370668}, // k=159 - {-0.7071067811865477, -0.7071067811865475}, // k=160 - {-0.6895405447370671, -0.7242470829514668}, // k=161 - {-0.6715589548470187, -0.7409511253549589}, // k=162 - {-0.6531728429537771, -0.7572088465064842}, // k=163 - {-0.6343932841636459, -0.7730104533627367}, // k=164 - {-0.6152315905806273, -0.7883464276266059}, // k=165 - {-0.5956993044924331, -0.8032075314806451}, // k=166 - {-0.5758081914178452, -0.8175848131515838}, // k=167 - {-0.5555702330196022, -0.8314696123025452}, // k=168 - {-0.5349976198870973, 
-0.8448535652497070}, // k=169 - {-0.5141027441932218, -0.8577286100002720}, // k=170 - {-0.4928981922297842, -0.8700869911087113}, // k=171 - {-0.4713967368259979, -0.8819212643483549}, // k=172 - {-0.4496113296546069, -0.8932243011955152}, // k=173 - {-0.4275550934302825, -0.9039892931234431}, // k=174 - {-0.4052413140049904, -0.9142097557035305}, // k=175 - {-0.3826834323650903, -0.9238795325112865}, // k=176 - {-0.3598950365349879, -0.9329927988347390}, // k=177 - {-0.3368898533922199, -0.9415440651830208}, // k=178 - {-0.3136817403988915, -0.9495281805930367}, // k=179 - {-0.2902846772544624, -0.9569403357322088}, // k=180 - {-0.2667127574748985, -0.9637760657954398}, // k=181 - {-0.2429801799032641, -0.9700312531945440}, // k=182 - {-0.2191012401568701, -0.9757021300385285}, // k=183 - {-0.1950903220161287, -0.9807852804032303}, // k=184 - {-0.1709618887603017, -0.9852776423889411}, // k=185 - {-0.1467304744553623, -0.9891765099647809}, // k=186 - {-0.1224106751992160, -0.9924795345987101}, // k=187 - {-0.0980171403295605, -0.9951847266721969}, // k=188 - {-0.0735645635996674, -0.9972904566786902}, // k=189 - {-0.0490676743274180, -0.9987954562051724}, // k=190 - {-0.0245412285229124, -0.9996988186962042}, // k=191 - {-0.0000000000000002, -1.0000000000000000}, // k=192 - {0.0245412285229120, -0.9996988186962042}, // k=193 - {0.0490676743274177, -0.9987954562051724}, // k=194 - {0.0735645635996670, -0.9972904566786902}, // k=195 - {0.0980171403295601, -0.9951847266721969}, // k=196 - {0.1224106751992156, -0.9924795345987101}, // k=197 - {0.1467304744553619, -0.9891765099647809}, // k=198 - {0.1709618887603013, -0.9852776423889412}, // k=199 - {0.1950903220161283, -0.9807852804032304}, // k=200 - {0.2191012401568697, -0.9757021300385286}, // k=201 - {0.2429801799032638, -0.9700312531945440}, // k=202 - {0.2667127574748982, -0.9637760657954400}, // k=203 - {0.2902846772544621, -0.9569403357322089}, // k=204 - {0.3136817403988911, -0.9495281805930368}, // k=205 - 
{0.3368898533922196, -0.9415440651830209}, // k=206 - {0.3598950365349876, -0.9329927988347391}, // k=207 - {0.3826834323650900, -0.9238795325112866}, // k=208 - {0.4052413140049900, -0.9142097557035306}, // k=209 - {0.4275550934302821, -0.9039892931234433}, // k=210 - {0.4496113296546066, -0.8932243011955153}, // k=211 - {0.4713967368259976, -0.8819212643483550}, // k=212 - {0.4928981922297839, -0.8700869911087115}, // k=213 - {0.5141027441932216, -0.8577286100002722}, // k=214 - {0.5349976198870969, -0.8448535652497072}, // k=215 - {0.5555702330196018, -0.8314696123025455}, // k=216 - {0.5758081914178449, -0.8175848131515840}, // k=217 - {0.5956993044924329, -0.8032075314806453}, // k=218 - {0.6152315905806270, -0.7883464276266061}, // k=219 - {0.6343932841636456, -0.7730104533627369}, // k=220 - {0.6531728429537768, -0.7572088465064846}, // k=221 - {0.6715589548470183, -0.7409511253549591}, // k=222 - {0.6895405447370668, -0.7242470829514670}, // k=223 - {0.7071067811865474, -0.7071067811865477}, // k=224 - {0.7242470829514667, -0.6895405447370672}, // k=225 - {0.7409511253549589, -0.6715589548470187}, // k=226 - {0.7572088465064842, -0.6531728429537771}, // k=227 - {0.7730104533627367, -0.6343932841636459}, // k=228 - {0.7883464276266059, -0.6152315905806274}, // k=229 - {0.8032075314806451, -0.5956993044924332}, // k=230 - {0.8175848131515837, -0.5758081914178452}, // k=231 - {0.8314696123025452, -0.5555702330196022}, // k=232 - {0.8448535652497070, -0.5349976198870973}, // k=233 - {0.8577286100002720, -0.5141027441932219}, // k=234 - {0.8700869911087113, -0.4928981922297843}, // k=235 - {0.8819212643483548, -0.4713967368259979}, // k=236 - {0.8932243011955151, -0.4496113296546070}, // k=237 - {0.9039892931234431, -0.4275550934302825}, // k=238 - {0.9142097557035305, -0.4052413140049904}, // k=239 - {0.9238795325112865, -0.3826834323650904}, // k=240 - {0.9329927988347390, -0.3598950365349880}, // k=241 - {0.9415440651830208, -0.3368898533922200}, // k=242 - 
{0.9495281805930367, -0.3136817403988915}, // k=243 - {0.9569403357322088, -0.2902846772544625}, // k=244 - {0.9637760657954398, -0.2667127574748986}, // k=245 - {0.9700312531945440, -0.2429801799032642}, // k=246 - {0.9757021300385285, -0.2191012401568702}, // k=247 - {0.9807852804032303, -0.1950903220161287}, // k=248 - {0.9852776423889411, -0.1709618887603018}, // k=249 - {0.9891765099647809, -0.1467304744553624}, // k=250 - {0.9924795345987100, -0.1224106751992160}, // k=251 - {0.9951847266721969, -0.0980171403295605}, // k=252 - {0.9972904566786902, -0.0735645635996674}, // k=253 - {0.9987954562051724, -0.0490676743274181}, // k=254 - {0.9996988186962042, -0.0245412285229124}, // k=255 -}; -__constant__ double2 c_twiddle_fwd_512[512] = { +__constant__ double2 c_twiddle_512[512] = { {1.0000000000000000, -0.0000000000000000}, // k=0 {0.9999247018391445, -0.0122715382857199}, // k=1 {0.9996988186962042, -0.0245412285229123}, // k=2 @@ -1529,518 +1023,3 @@ __constant__ double2 c_twiddle_fwd_512[512] = { {0.9996988186962042, 0.0245412285229124}, // k=510 {0.9999247018391445, 0.0122715382857206}, // k=511 }; - -__constant__ double2 c_twiddle_inv_512[512] = { - {1.0000000000000000, 0.0000000000000000}, // k=0 - {0.9999247018391445, 0.0122715382857199}, // k=1 - {0.9996988186962042, 0.0245412285229123}, // k=2 - {0.9993223845883495, 0.0368072229413588}, // k=3 - {0.9987954562051724, 0.0490676743274180}, // k=4 - {0.9981181129001492, 0.0613207363022086}, // k=5 - {0.9972904566786902, 0.0735645635996674}, // k=6 - {0.9963126121827780, 0.0857973123444399}, // k=7 - {0.9951847266721969, 0.0980171403295606}, // k=8 - {0.9939069700023561, 0.1102222072938831}, // k=9 - {0.9924795345987100, 0.1224106751992162}, // k=10 - {0.9909026354277800, 0.1345807085071262}, // k=11 - {0.9891765099647810, 0.1467304744553617}, // k=12 - {0.9873014181578584, 0.1588581433338614}, // k=13 - {0.9852776423889412, 0.1709618887603012}, // k=14 - {0.9831054874312163, 0.1830398879551410}, // k=15 - 
{0.9807852804032304, 0.1950903220161282}, // k=16 - {0.9783173707196277, 0.2071113761922186}, // k=17 - {0.9757021300385286, 0.2191012401568698}, // k=18 - {0.9729399522055602, 0.2310581082806711}, // k=19 - {0.9700312531945440, 0.2429801799032639}, // k=20 - {0.9669764710448521, 0.2548656596045146}, // k=21 - {0.9637760657954398, 0.2667127574748984}, // k=22 - {0.9604305194155658, 0.2785196893850531}, // k=23 - {0.9569403357322088, 0.2902846772544623}, // k=24 - {0.9533060403541939, 0.3020059493192281}, // k=25 - {0.9495281805930367, 0.3136817403988915}, // k=26 - {0.9456073253805213, 0.3253102921622629}, // k=27 - {0.9415440651830208, 0.3368898533922201}, // k=28 - {0.9373390119125750, 0.3484186802494346}, // k=29 - {0.9329927988347390, 0.3598950365349881}, // k=30 - {0.9285060804732156, 0.3713171939518375}, // k=31 - {0.9238795325112867, 0.3826834323650898}, // k=32 - {0.9191138516900578, 0.3939920400610481}, // k=33 - {0.9142097557035307, 0.4052413140049899}, // k=34 - {0.9091679830905224, 0.4164295600976372}, // k=35 - {0.9039892931234433, 0.4275550934302821}, // k=36 - {0.8986744656939538, 0.4386162385385277}, // k=37 - {0.8932243011955153, 0.4496113296546065}, // k=38 - {0.8876396204028539, 0.4605387109582400}, // k=39 - {0.8819212643483550, 0.4713967368259976}, // k=40 - {0.8760700941954066, 0.4821837720791227}, // k=41 - {0.8700869911087115, 0.4928981922297840}, // k=42 - {0.8639728561215868, 0.5035383837257176}, // k=43 - {0.8577286100002721, 0.5141027441932217}, // k=44 - {0.8513551931052652, 0.5245896826784689}, // k=45 - {0.8448535652497071, 0.5349976198870972}, // k=46 - {0.8382247055548381, 0.5453249884220465}, // k=47 - {0.8314696123025452, 0.5555702330196022}, // k=48 - {0.8245893027850253, 0.5657318107836131}, // k=49 - {0.8175848131515837, 0.5758081914178453}, // k=50 - {0.8104571982525948, 0.5857978574564389}, // k=51 - {0.8032075314806449, 0.5956993044924334}, // k=52 - {0.7958369046088836, 0.6055110414043255}, // k=53 - {0.7883464276266063, 
0.6152315905806268}, // k=54 - {0.7807372285720945, 0.6248594881423863}, // k=55 - {0.7730104533627370, 0.6343932841636455}, // k=56 - {0.7651672656224590, 0.6438315428897914}, // k=57 - {0.7572088465064846, 0.6531728429537768}, // k=58 - {0.7491363945234594, 0.6624157775901718}, // k=59 - {0.7409511253549591, 0.6715589548470183}, // k=60 - {0.7326542716724128, 0.6806009977954530}, // k=61 - {0.7242470829514670, 0.6895405447370668}, // k=62 - {0.7157308252838186, 0.6983762494089729}, // k=63 - {0.7071067811865476, 0.7071067811865475}, // k=64 - {0.6983762494089729, 0.7157308252838186}, // k=65 - {0.6895405447370669, 0.7242470829514669}, // k=66 - {0.6806009977954531, 0.7326542716724128}, // k=67 - {0.6715589548470183, 0.7409511253549591}, // k=68 - {0.6624157775901718, 0.7491363945234593}, // k=69 - {0.6531728429537768, 0.7572088465064845}, // k=70 - {0.6438315428897915, 0.7651672656224590}, // k=71 - {0.6343932841636455, 0.7730104533627370}, // k=72 - {0.6248594881423865, 0.7807372285720944}, // k=73 - {0.6152315905806268, 0.7883464276266062}, // k=74 - {0.6055110414043255, 0.7958369046088835}, // k=75 - {0.5956993044924335, 0.8032075314806448}, // k=76 - {0.5857978574564389, 0.8104571982525948}, // k=77 - {0.5758081914178453, 0.8175848131515837}, // k=78 - {0.5657318107836132, 0.8245893027850253}, // k=79 - {0.5555702330196023, 0.8314696123025452}, // k=80 - {0.5453249884220465, 0.8382247055548380}, // k=81 - {0.5349976198870973, 0.8448535652497070}, // k=82 - {0.5245896826784688, 0.8513551931052652}, // k=83 - {0.5141027441932217, 0.8577286100002721}, // k=84 - {0.5035383837257176, 0.8639728561215867}, // k=85 - {0.4928981922297841, 0.8700869911087113}, // k=86 - {0.4821837720791228, 0.8760700941954066}, // k=87 - {0.4713967368259978, 0.8819212643483549}, // k=88 - {0.4605387109582400, 0.8876396204028539}, // k=89 - {0.4496113296546066, 0.8932243011955153}, // k=90 - {0.4386162385385277, 0.8986744656939538}, // k=91 - {0.4275550934302822, 0.9039892931234433}, // 
k=92 - {0.4164295600976373, 0.9091679830905223}, // k=93 - {0.4052413140049899, 0.9142097557035307}, // k=94 - {0.3939920400610481, 0.9191138516900578}, // k=95 - {0.3826834323650898, 0.9238795325112867}, // k=96 - {0.3713171939518376, 0.9285060804732155}, // k=97 - {0.3598950365349883, 0.9329927988347388}, // k=98 - {0.3484186802494345, 0.9373390119125750}, // k=99 - {0.3368898533922201, 0.9415440651830208}, // k=100 - {0.3253102921622630, 0.9456073253805213}, // k=101 - {0.3136817403988916, 0.9495281805930367}, // k=102 - {0.3020059493192282, 0.9533060403541938}, // k=103 - {0.2902846772544623, 0.9569403357322089}, // k=104 - {0.2785196893850531, 0.9604305194155658}, // k=105 - {0.2667127574748984, 0.9637760657954398}, // k=106 - {0.2548656596045146, 0.9669764710448521}, // k=107 - {0.2429801799032640, 0.9700312531945440}, // k=108 - {0.2310581082806713, 0.9729399522055601}, // k=109 - {0.2191012401568698, 0.9757021300385286}, // k=110 - {0.2071113761922186, 0.9783173707196277}, // k=111 - {0.1950903220161283, 0.9807852804032304}, // k=112 - {0.1830398879551411, 0.9831054874312163}, // k=113 - {0.1709618887603014, 0.9852776423889412}, // k=114 - {0.1588581433338614, 0.9873014181578584}, // k=115 - {0.1467304744553617, 0.9891765099647810}, // k=116 - {0.1345807085071262, 0.9909026354277800}, // k=117 - {0.1224106751992163, 0.9924795345987100}, // k=118 - {0.1102222072938832, 0.9939069700023561}, // k=119 - {0.0980171403295608, 0.9951847266721968}, // k=120 - {0.0857973123444399, 0.9963126121827780}, // k=121 - {0.0735645635996675, 0.9972904566786902}, // k=122 - {0.0613207363022086, 0.9981181129001492}, // k=123 - {0.0490676743274181, 0.9987954562051724}, // k=124 - {0.0368072229413590, 0.9993223845883495}, // k=125 - {0.0245412285229123, 0.9996988186962042}, // k=126 - {0.0122715382857199, 0.9999247018391445}, // k=127 - {0.0000000000000001, 1.0000000000000000}, // k=128 - {-0.0122715382857198, 0.9999247018391445}, // k=129 - {-0.0245412285229121, 
0.9996988186962042}, // k=130 - {-0.0368072229413589, 0.9993223845883495}, // k=131 - {-0.0490676743274180, 0.9987954562051724}, // k=132 - {-0.0613207363022085, 0.9981181129001492}, // k=133 - {-0.0735645635996673, 0.9972904566786902}, // k=134 - {-0.0857973123444398, 0.9963126121827780}, // k=135 - {-0.0980171403295606, 0.9951847266721969}, // k=136 - {-0.1102222072938831, 0.9939069700023561}, // k=137 - {-0.1224106751992162, 0.9924795345987100}, // k=138 - {-0.1345807085071261, 0.9909026354277800}, // k=139 - {-0.1467304744553616, 0.9891765099647810}, // k=140 - {-0.1588581433338613, 0.9873014181578584}, // k=141 - {-0.1709618887603012, 0.9852776423889412}, // k=142 - {-0.1830398879551409, 0.9831054874312163}, // k=143 - {-0.1950903220161282, 0.9807852804032304}, // k=144 - {-0.2071113761922184, 0.9783173707196277}, // k=145 - {-0.2191012401568697, 0.9757021300385286}, // k=146 - {-0.2310581082806711, 0.9729399522055602}, // k=147 - {-0.2429801799032639, 0.9700312531945440}, // k=148 - {-0.2548656596045145, 0.9669764710448521}, // k=149 - {-0.2667127574748983, 0.9637760657954398}, // k=150 - {-0.2785196893850529, 0.9604305194155659}, // k=151 - {-0.2902846772544622, 0.9569403357322089}, // k=152 - {-0.3020059493192281, 0.9533060403541939}, // k=153 - {-0.3136817403988914, 0.9495281805930367}, // k=154 - {-0.3253102921622629, 0.9456073253805214}, // k=155 - {-0.3368898533922199, 0.9415440651830208}, // k=156 - {-0.3484186802494344, 0.9373390119125750}, // k=157 - {-0.3598950365349882, 0.9329927988347388}, // k=158 - {-0.3713171939518375, 0.9285060804732156}, // k=159 - {-0.3826834323650897, 0.9238795325112867}, // k=160 - {-0.3939920400610480, 0.9191138516900578}, // k=161 - {-0.4052413140049897, 0.9142097557035307}, // k=162 - {-0.4164295600976370, 0.9091679830905225}, // k=163 - {-0.4275550934302819, 0.9039892931234434}, // k=164 - {-0.4386162385385274, 0.8986744656939539}, // k=165 - {-0.4496113296546067, 0.8932243011955152}, // k=166 - {-0.4605387109582401, 
0.8876396204028539}, // k=167 - {-0.4713967368259977, 0.8819212643483550}, // k=168 - {-0.4821837720791227, 0.8760700941954066}, // k=169 - {-0.4928981922297840, 0.8700869911087115}, // k=170 - {-0.5035383837257175, 0.8639728561215868}, // k=171 - {-0.5141027441932217, 0.8577286100002721}, // k=172 - {-0.5245896826784687, 0.8513551931052652}, // k=173 - {-0.5349976198870970, 0.8448535652497072}, // k=174 - {-0.5453249884220462, 0.8382247055548382}, // k=175 - {-0.5555702330196020, 0.8314696123025455}, // k=176 - {-0.5657318107836132, 0.8245893027850252}, // k=177 - {-0.5758081914178453, 0.8175848131515837}, // k=178 - {-0.5857978574564389, 0.8104571982525948}, // k=179 - {-0.5956993044924334, 0.8032075314806449}, // k=180 - {-0.6055110414043254, 0.7958369046088836}, // k=181 - {-0.6152315905806267, 0.7883464276266063}, // k=182 - {-0.6248594881423862, 0.7807372285720946}, // k=183 - {-0.6343932841636454, 0.7730104533627371}, // k=184 - {-0.6438315428897913, 0.7651672656224591}, // k=185 - {-0.6531728429537765, 0.7572088465064847}, // k=186 - {-0.6624157775901719, 0.7491363945234593}, // k=187 - {-0.6715589548470184, 0.7409511253549590}, // k=188 - {-0.6806009977954530, 0.7326542716724128}, // k=189 - {-0.6895405447370669, 0.7242470829514669}, // k=190 - {-0.6983762494089728, 0.7157308252838187}, // k=191 - {-0.7071067811865475, 0.7071067811865476}, // k=192 - {-0.7157308252838186, 0.6983762494089729}, // k=193 - {-0.7242470829514668, 0.6895405447370671}, // k=194 - {-0.7326542716724127, 0.6806009977954532}, // k=195 - {-0.7409511253549589, 0.6715589548470186}, // k=196 - {-0.7491363945234591, 0.6624157775901720}, // k=197 - {-0.7572088465064846, 0.6531728429537766}, // k=198 - {-0.7651672656224590, 0.6438315428897914}, // k=199 - {-0.7730104533627370, 0.6343932841636455}, // k=200 - {-0.7807372285720945, 0.6248594881423863}, // k=201 - {-0.7883464276266062, 0.6152315905806269}, // k=202 - {-0.7958369046088835, 0.6055110414043257}, // k=203 - {-0.8032075314806448, 
0.5956993044924335}, // k=204 - {-0.8104571982525947, 0.5857978574564390}, // k=205 - {-0.8175848131515836, 0.5758081914178454}, // k=206 - {-0.8245893027850251, 0.5657318107836135}, // k=207 - {-0.8314696123025453, 0.5555702330196022}, // k=208 - {-0.8382247055548381, 0.5453249884220464}, // k=209 - {-0.8448535652497071, 0.5349976198870972}, // k=210 - {-0.8513551931052652, 0.5245896826784689}, // k=211 - {-0.8577286100002720, 0.5141027441932218}, // k=212 - {-0.8639728561215867, 0.5035383837257177}, // k=213 - {-0.8700869911087113, 0.4928981922297841}, // k=214 - {-0.8760700941954065, 0.4821837720791229}, // k=215 - {-0.8819212643483549, 0.4713967368259979}, // k=216 - {-0.8876396204028538, 0.4605387109582402}, // k=217 - {-0.8932243011955152, 0.4496113296546069}, // k=218 - {-0.8986744656939539, 0.4386162385385275}, // k=219 - {-0.9039892931234433, 0.4275550934302820}, // k=220 - {-0.9091679830905224, 0.4164295600976372}, // k=221 - {-0.9142097557035307, 0.4052413140049899}, // k=222 - {-0.9191138516900578, 0.3939920400610482}, // k=223 - {-0.9238795325112867, 0.3826834323650899}, // k=224 - {-0.9285060804732155, 0.3713171939518377}, // k=225 - {-0.9329927988347388, 0.3598950365349883}, // k=226 - {-0.9373390119125748, 0.3484186802494348}, // k=227 - {-0.9415440651830207, 0.3368898533922203}, // k=228 - {-0.9456073253805212, 0.3253102921622633}, // k=229 - {-0.9495281805930367, 0.3136817403988914}, // k=230 - {-0.9533060403541939, 0.3020059493192280}, // k=231 - {-0.9569403357322088, 0.2902846772544624}, // k=232 - {-0.9604305194155658, 0.2785196893850532}, // k=233 - {-0.9637760657954398, 0.2667127574748985}, // k=234 - {-0.9669764710448521, 0.2548656596045147}, // k=235 - {-0.9700312531945440, 0.2429801799032641}, // k=236 - {-0.9729399522055601, 0.2310581082806713}, // k=237 - {-0.9757021300385285, 0.2191012401568700}, // k=238 - {-0.9783173707196275, 0.2071113761922188}, // k=239 - {-0.9807852804032304, 0.1950903220161286}, // k=240 - {-0.9831054874312163, 
0.1830398879551409}, // k=241 - {-0.9852776423889412, 0.1709618887603012}, // k=242 - {-0.9873014181578584, 0.1588581433338615}, // k=243 - {-0.9891765099647810, 0.1467304744553618}, // k=244 - {-0.9909026354277800, 0.1345807085071263}, // k=245 - {-0.9924795345987100, 0.1224106751992163}, // k=246 - {-0.9939069700023561, 0.1102222072938832}, // k=247 - {-0.9951847266721968, 0.0980171403295608}, // k=248 - {-0.9963126121827780, 0.0857973123444402}, // k=249 - {-0.9972904566786902, 0.0735645635996677}, // k=250 - {-0.9981181129001492, 0.0613207363022085}, // k=251 - {-0.9987954562051724, 0.0490676743274180}, // k=252 - {-0.9993223845883495, 0.0368072229413588}, // k=253 - {-0.9996988186962042, 0.0245412285229123}, // k=254 - {-0.9999247018391445, 0.0122715382857200}, // k=255 - {-1.0000000000000000, 0.0000000000000001}, // k=256 - {-0.9999247018391445, -0.0122715382857198}, // k=257 - {-0.9996988186962042, -0.0245412285229121}, // k=258 - {-0.9993223845883495, -0.0368072229413586}, // k=259 - {-0.9987954562051724, -0.0490676743274177}, // k=260 - {-0.9981181129001492, -0.0613207363022082}, // k=261 - {-0.9972904566786902, -0.0735645635996675}, // k=262 - {-0.9963126121827780, -0.0857973123444399}, // k=263 - {-0.9951847266721969, -0.0980171403295606}, // k=264 - {-0.9939069700023561, -0.1102222072938830}, // k=265 - {-0.9924795345987100, -0.1224106751992161}, // k=266 - {-0.9909026354277800, -0.1345807085071261}, // k=267 - {-0.9891765099647810, -0.1467304744553616}, // k=268 - {-0.9873014181578584, -0.1588581433338612}, // k=269 - {-0.9852776423889413, -0.1709618887603010}, // k=270 - {-0.9831054874312164, -0.1830398879551406}, // k=271 - {-0.9807852804032304, -0.1950903220161284}, // k=272 - {-0.9783173707196277, -0.2071113761922186}, // k=273 - {-0.9757021300385286, -0.2191012401568698}, // k=274 - {-0.9729399522055602, -0.2310581082806711}, // k=275 - {-0.9700312531945440, -0.2429801799032638}, // k=276 - {-0.9669764710448522, -0.2548656596045145}, // k=277 - 
{-0.9637760657954400, -0.2667127574748983}, // k=278 - {-0.9604305194155659, -0.2785196893850529}, // k=279 - {-0.9569403357322089, -0.2902846772544621}, // k=280 - {-0.9533060403541940, -0.3020059493192278}, // k=281 - {-0.9495281805930368, -0.3136817403988912}, // k=282 - {-0.9456073253805213, -0.3253102921622630}, // k=283 - {-0.9415440651830208, -0.3368898533922201}, // k=284 - {-0.9373390119125750, -0.3484186802494346}, // k=285 - {-0.9329927988347390, -0.3598950365349881}, // k=286 - {-0.9285060804732156, -0.3713171939518374}, // k=287 - {-0.9238795325112868, -0.3826834323650897}, // k=288 - {-0.9191138516900578, -0.3939920400610479}, // k=289 - {-0.9142097557035307, -0.4052413140049897}, // k=290 - {-0.9091679830905225, -0.4164295600976369}, // k=291 - {-0.9039892931234434, -0.4275550934302818}, // k=292 - {-0.8986744656939540, -0.4386162385385273}, // k=293 - {-0.8932243011955153, -0.4496113296546067}, // k=294 - {-0.8876396204028539, -0.4605387109582401}, // k=295 - {-0.8819212643483550, -0.4713967368259976}, // k=296 - {-0.8760700941954066, -0.4821837720791227}, // k=297 - {-0.8700869911087115, -0.4928981922297839}, // k=298 - {-0.8639728561215868, -0.5035383837257175}, // k=299 - {-0.8577286100002721, -0.5141027441932216}, // k=300 - {-0.8513551931052653, -0.5245896826784687}, // k=301 - {-0.8448535652497072, -0.5349976198870969}, // k=302 - {-0.8382247055548382, -0.5453249884220461}, // k=303 - {-0.8314696123025455, -0.5555702330196020}, // k=304 - {-0.8245893027850253, -0.5657318107836132}, // k=305 - {-0.8175848131515837, -0.5758081914178453}, // k=306 - {-0.8104571982525948, -0.5857978574564389}, // k=307 - {-0.8032075314806449, -0.5956993044924332}, // k=308 - {-0.7958369046088836, -0.6055110414043254}, // k=309 - {-0.7883464276266063, -0.6152315905806267}, // k=310 - {-0.7807372285720946, -0.6248594881423862}, // k=311 - {-0.7730104533627371, -0.6343932841636453}, // k=312 - {-0.7651672656224591, -0.6438315428897913}, // k=313 - 
{-0.7572088465064848, -0.6531728429537765}, // k=314 - {-0.7491363945234593, -0.6624157775901718}, // k=315 - {-0.7409511253549591, -0.6715589548470184}, // k=316 - {-0.7326542716724128, -0.6806009977954530}, // k=317 - {-0.7242470829514670, -0.6895405447370668}, // k=318 - {-0.7157308252838187, -0.6983762494089728}, // k=319 - {-0.7071067811865477, -0.7071067811865475}, // k=320 - {-0.6983762494089730, -0.7157308252838185}, // k=321 - {-0.6895405447370671, -0.7242470829514668}, // k=322 - {-0.6806009977954532, -0.7326542716724126}, // k=323 - {-0.6715589548470187, -0.7409511253549589}, // k=324 - {-0.6624157775901720, -0.7491363945234590}, // k=325 - {-0.6531728429537771, -0.7572088465064842}, // k=326 - {-0.6438315428897915, -0.7651672656224590}, // k=327 - {-0.6343932841636459, -0.7730104533627367}, // k=328 - {-0.6248594881423865, -0.7807372285720944}, // k=329 - {-0.6152315905806273, -0.7883464276266059}, // k=330 - {-0.6055110414043257, -0.7958369046088835}, // k=331 - {-0.5956993044924331, -0.8032075314806451}, // k=332 - {-0.5857978574564391, -0.8104571982525947}, // k=333 - {-0.5758081914178452, -0.8175848131515838}, // k=334 - {-0.5657318107836135, -0.8245893027850251}, // k=335 - {-0.5555702330196022, -0.8314696123025452}, // k=336 - {-0.5453249884220468, -0.8382247055548379}, // k=337 - {-0.5349976198870973, -0.8448535652497070}, // k=338 - {-0.5245896826784694, -0.8513551931052649}, // k=339 - {-0.5141027441932218, -0.8577286100002720}, // k=340 - {-0.5035383837257180, -0.8639728561215865}, // k=341 - {-0.4928981922297842, -0.8700869911087113}, // k=342 - {-0.4821837720791226, -0.8760700941954067}, // k=343 - {-0.4713967368259979, -0.8819212643483549}, // k=344 - {-0.4605387109582399, -0.8876396204028540}, // k=345 - {-0.4496113296546069, -0.8932243011955152}, // k=346 - {-0.4386162385385276, -0.8986744656939538}, // k=347 - {-0.4275550934302825, -0.9039892931234431}, // k=348 - {-0.4164295600976372, -0.9091679830905224}, // k=349 - 
{-0.4052413140049904, -0.9142097557035305}, // k=350 - {-0.3939920400610482, -0.9191138516900577}, // k=351 - {-0.3826834323650903, -0.9238795325112865}, // k=352 - {-0.3713171939518378, -0.9285060804732155}, // k=353 - {-0.3598950365349879, -0.9329927988347390}, // k=354 - {-0.3484186802494348, -0.9373390119125748}, // k=355 - {-0.3368898533922199, -0.9415440651830208}, // k=356 - {-0.3253102921622633, -0.9456073253805212}, // k=357 - {-0.3136817403988915, -0.9495281805930367}, // k=358 - {-0.3020059493192285, -0.9533060403541938}, // k=359 - {-0.2902846772544624, -0.9569403357322088}, // k=360 - {-0.2785196893850536, -0.9604305194155657}, // k=361 - {-0.2667127574748985, -0.9637760657954398}, // k=362 - {-0.2548656596045143, -0.9669764710448522}, // k=363 - {-0.2429801799032641, -0.9700312531945440}, // k=364 - {-0.2310581082806709, -0.9729399522055602}, // k=365 - {-0.2191012401568701, -0.9757021300385285}, // k=366 - {-0.2071113761922185, -0.9783173707196277}, // k=367 - {-0.1950903220161287, -0.9807852804032303}, // k=368 - {-0.1830398879551410, -0.9831054874312163}, // k=369 - {-0.1709618887603017, -0.9852776423889411}, // k=370 - {-0.1588581433338615, -0.9873014181578583}, // k=371 - {-0.1467304744553623, -0.9891765099647809}, // k=372 - {-0.1345807085071264, -0.9909026354277800}, // k=373 - {-0.1224106751992160, -0.9924795345987101}, // k=374 - {-0.1102222072938833, -0.9939069700023561}, // k=375 - {-0.0980171403295605, -0.9951847266721969}, // k=376 - {-0.0857973123444402, -0.9963126121827780}, // k=377 - {-0.0735645635996674, -0.9972904566786902}, // k=378 - {-0.0613207363022090, -0.9981181129001492}, // k=379 - {-0.0490676743274180, -0.9987954562051724}, // k=380 - {-0.0368072229413593, -0.9993223845883494}, // k=381 - {-0.0245412285229124, -0.9996988186962042}, // k=382 - {-0.0122715382857205, -0.9999247018391445}, // k=383 - {-0.0000000000000002, -1.0000000000000000}, // k=384 - {0.0122715382857201, -0.9999247018391445}, // k=385 - {0.0245412285229120, 
-0.9996988186962042}, // k=386 - {0.0368072229413590, -0.9993223845883495}, // k=387 - {0.0490676743274177, -0.9987954562051724}, // k=388 - {0.0613207363022086, -0.9981181129001492}, // k=389 - {0.0735645635996670, -0.9972904566786902}, // k=390 - {0.0857973123444399, -0.9963126121827780}, // k=391 - {0.0980171403295601, -0.9951847266721969}, // k=392 - {0.1102222072938829, -0.9939069700023561}, // k=393 - {0.1224106751992156, -0.9924795345987101}, // k=394 - {0.1345807085071260, -0.9909026354277800}, // k=395 - {0.1467304744553619, -0.9891765099647809}, // k=396 - {0.1588581433338612, -0.9873014181578584}, // k=397 - {0.1709618887603013, -0.9852776423889412}, // k=398 - {0.1830398879551406, -0.9831054874312164}, // k=399 - {0.1950903220161283, -0.9807852804032304}, // k=400 - {0.2071113761922181, -0.9783173707196278}, // k=401 - {0.2191012401568697, -0.9757021300385286}, // k=402 - {0.2310581082806706, -0.9729399522055603}, // k=403 - {0.2429801799032638, -0.9700312531945440}, // k=404 - {0.2548656596045140, -0.9669764710448523}, // k=405 - {0.2667127574748982, -0.9637760657954400}, // k=406 - {0.2785196893850533, -0.9604305194155658}, // k=407 - {0.2902846772544621, -0.9569403357322089}, // k=408 - {0.3020059493192281, -0.9533060403541939}, // k=409 - {0.3136817403988911, -0.9495281805930368}, // k=410 - {0.3253102921622629, -0.9456073253805213}, // k=411 - {0.3368898533922196, -0.9415440651830209}, // k=412 - {0.3484186802494345, -0.9373390119125750}, // k=413 - {0.3598950365349876, -0.9329927988347391}, // k=414 - {0.3713171939518374, -0.9285060804732156}, // k=415 - {0.3826834323650900, -0.9238795325112866}, // k=416 - {0.3939920400610479, -0.9191138516900579}, // k=417 - {0.4052413140049900, -0.9142097557035306}, // k=418 - {0.4164295600976369, -0.9091679830905225}, // k=419 - {0.4275550934302821, -0.9039892931234433}, // k=420 - {0.4386162385385273, -0.8986744656939540}, // k=421 - {0.4496113296546066, -0.8932243011955153}, // k=422 - {0.4605387109582396, 
-0.8876396204028542}, // k=423 - {0.4713967368259976, -0.8819212643483550}, // k=424 - {0.4821837720791222, -0.8760700941954069}, // k=425 - {0.4928981922297839, -0.8700869911087115}, // k=426 - {0.5035383837257178, -0.8639728561215866}, // k=427 - {0.5141027441932216, -0.8577286100002722}, // k=428 - {0.5245896826784691, -0.8513551931052651}, // k=429 - {0.5349976198870969, -0.8448535652497072}, // k=430 - {0.5453249884220465, -0.8382247055548380}, // k=431 - {0.5555702330196018, -0.8314696123025455}, // k=432 - {0.5657318107836131, -0.8245893027850253}, // k=433 - {0.5758081914178449, -0.8175848131515840}, // k=434 - {0.5857978574564388, -0.8104571982525949}, // k=435 - {0.5956993044924329, -0.8032075314806453}, // k=436 - {0.6055110414043253, -0.7958369046088837}, // k=437 - {0.6152315905806270, -0.7883464276266061}, // k=438 - {0.6248594881423861, -0.7807372285720946}, // k=439 - {0.6343932841636456, -0.7730104533627369}, // k=440 - {0.6438315428897912, -0.7651672656224592}, // k=441 - {0.6531728429537768, -0.7572088465064846}, // k=442 - {0.6624157775901715, -0.7491363945234596}, // k=443 - {0.6715589548470183, -0.7409511253549591}, // k=444 - {0.6806009977954527, -0.7326542716724131}, // k=445 - {0.6895405447370668, -0.7242470829514670}, // k=446 - {0.6983762494089724, -0.7157308252838190}, // k=447 - {0.7071067811865474, -0.7071067811865477}, // k=448 - {0.7157308252838188, -0.6983762494089727}, // k=449 - {0.7242470829514667, -0.6895405447370672}, // k=450 - {0.7326542716724129, -0.6806009977954530}, // k=451 - {0.7409511253549589, -0.6715589548470187}, // k=452 - {0.7491363945234594, -0.6624157775901718}, // k=453 - {0.7572088465064842, -0.6531728429537771}, // k=454 - {0.7651672656224588, -0.6438315428897915}, // k=455 - {0.7730104533627367, -0.6343932841636459}, // k=456 - {0.7807372285720944, -0.6248594881423865}, // k=457 - {0.7883464276266059, -0.6152315905806274}, // k=458 - {0.7958369046088833, -0.6055110414043257}, // k=459 - {0.8032075314806451, 
-0.5956993044924332}, // k=460 - {0.8104571982525947, -0.5857978574564391}, // k=461 - {0.8175848131515837, -0.5758081914178452}, // k=462 - {0.8245893027850251, -0.5657318107836136}, // k=463 - {0.8314696123025452, -0.5555702330196022}, // k=464 - {0.8382247055548377, -0.5453249884220468}, // k=465 - {0.8448535652497070, -0.5349976198870973}, // k=466 - {0.8513551931052649, -0.5245896826784694}, // k=467 - {0.8577286100002720, -0.5141027441932219}, // k=468 - {0.8639728561215864, -0.5035383837257181}, // k=469 - {0.8700869911087113, -0.4928981922297843}, // k=470 - {0.8760700941954067, -0.4821837720791226}, // k=471 - {0.8819212643483548, -0.4713967368259979}, // k=472 - {0.8876396204028539, -0.4605387109582399}, // k=473 - {0.8932243011955151, -0.4496113296546070}, // k=474 - {0.8986744656939538, -0.4386162385385277}, // k=475 - {0.9039892931234431, -0.4275550934302825}, // k=476 - {0.9091679830905224, -0.4164295600976373}, // k=477 - {0.9142097557035305, -0.4052413140049904}, // k=478 - {0.9191138516900577, -0.3939920400610483}, // k=479 - {0.9238795325112865, -0.3826834323650904}, // k=480 - {0.9285060804732155, -0.3713171939518378}, // k=481 - {0.9329927988347390, -0.3598950365349880}, // k=482 - {0.9373390119125748, -0.3484186802494349}, // k=483 - {0.9415440651830208, -0.3368898533922200}, // k=484 - {0.9456073253805212, -0.3253102921622634}, // k=485 - {0.9495281805930367, -0.3136817403988915}, // k=486 - {0.9533060403541936, -0.3020059493192286}, // k=487 - {0.9569403357322088, -0.2902846772544625}, // k=488 - {0.9604305194155657, -0.2785196893850537}, // k=489 - {0.9637760657954398, -0.2667127574748986}, // k=490 - {0.9669764710448522, -0.2548656596045144}, // k=491 - {0.9700312531945440, -0.2429801799032642}, // k=492 - {0.9729399522055602, -0.2310581082806710}, // k=493 - {0.9757021300385285, -0.2191012401568702}, // k=494 - {0.9783173707196277, -0.2071113761922185}, // k=495 - {0.9807852804032303, -0.1950903220161287}, // k=496 - {0.9831054874312163, 
-0.1830398879551410}, // k=497 - {0.9852776423889411, -0.1709618887603018}, // k=498 - {0.9873014181578583, -0.1588581433338616}, // k=499 - {0.9891765099647809, -0.1467304744553624}, // k=500 - {0.9909026354277800, -0.1345807085071264}, // k=501 - {0.9924795345987100, -0.1224106751992160}, // k=502 - {0.9939069700023561, -0.1102222072938834}, // k=503 - {0.9951847266721969, -0.0980171403295605}, // k=504 - {0.9963126121827780, -0.0857973123444403}, // k=505 - {0.9972904566786902, -0.0735645635996674}, // k=506 - {0.9981181129001492, -0.0613207363022091}, // k=507 - {0.9987954562051724, -0.0490676743274181}, // k=508 - {0.9993223845883494, -0.0368072229413594}, // k=509 - {0.9996988186962042, -0.0245412285229124}, // k=510 - {0.9999247018391445, -0.0122715382857206}, // k=511 -}; From 932f156bfc045f80f28b2dcf72b086b45656e386 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 2025 09:41:22 -0700 Subject: [PATCH 24/58] Fixup: cleanup and run GPU fft on its own stream --- src/mesh/parallel/shiftedmetric.cxx | 90 +++++++++++++++-------------- 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/src/mesh/parallel/shiftedmetric.cxx b/src/mesh/parallel/shiftedmetric.cxx index 759c5aaa0a..1b92313117 100644 --- a/src/mesh/parallel/shiftedmetric.cxx +++ b/src/mesh/parallel/shiftedmetric.cxx @@ -72,7 +72,6 @@ void ShiftedMetric::cachePhases() { toAlignedPhs = Tensor(mesh.LocalNx, mesh.LocalNy, nmodes); // To/From field aligned phases - // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_ALL")) { int ix = i.x(); int iy = i.y(); @@ -111,7 +110,6 @@ void ShiftedMetric::cachePhases() { // Parallel slice phases -- note we don't shift in the boundaries/guards for (auto& slice : parallel_slice_phases) { - // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_NOY")) { int ix = i.x(); @@ -173,7 +171,6 @@ Field3D ShiftedMetric::shiftZ(const 
Field3D& f, const Tensor& phs, Field3D result{emptyFrom(f).setDirectionY(y_direction_out)}; - // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D(toString(region))) { shiftZ(&f(i, 0), &phs(i.x(), i.y(), 0), &result(i, 0)); } @@ -204,8 +201,7 @@ FieldPerp ShiftedMetric::shiftZ(const FieldPerp& f, const Tensor& phs, return result; } -void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* out, - int num_batches) const { +void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* out) const { #if BOUT_HAS_UMPIRE // TODO: This static keyword is a hotfix and should be removed in // future iterations. It is here because otherwise many allocations @@ -217,7 +213,7 @@ void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* ou #endif // Take forward FFT - rfft(in, mesh.LocalNz * num_batches, &cmplx[0]); + rfft(in, mesh.LocalNz, &cmplx[0]); // Following is an algorithm approach to write a = a*b where a and b are // vectors of dcomplex. 
@@ -246,13 +242,14 @@ __device__ inline unsigned int bit_reverse(unsigned int x, unsigned int log2n) { // Block-level cooperative FFT // Multiple threads cooperate on each FFT using shared memory template -__global__ void -fft_block_cooperative(const BoutReal** __restrict__ in, BoutReal** __restrict__ out, - const double2** __restrict__ blocks_phs, const int Nz_runtime, - const int nmodes, const int batches, const int nblocks) { +__global__ void fft_block_cooperative(const BoutReal** __restrict__ in, + BoutReal** __restrict__ out, + const double2** __restrict__ blocks_phs, + const int batches, const int nblocks) { constexpr int LOG2_NZ = __builtin_ctz(NZ); constexpr double INV_NZ = 1.0 / (double)NZ; + constexpr int NMODES = (NZ / 2) + 1; // Shared memory for FFTS_PER_BLOCK FFTs // Each FFT needs NZ complex values @@ -261,15 +258,15 @@ fft_block_cooperative(const BoutReal** __restrict__ in, BoutReal** __restrict__ // Select twiddles based on size const double2* twiddles; if constexpr (NZ == 16) { - twiddles = c_twiddle_fwd_16; + twiddles = c_twiddle_16; } else if constexpr (NZ == 64) { - twiddles = c_twiddle_fwd_64; + twiddles = c_twiddle_64; } else if constexpr (NZ == 128) { - twiddles = c_twiddle_fwd_128; + twiddles = c_twiddle_128; } else if constexpr (NZ == 256) { - twiddles = c_twiddle_fwd_256; + twiddles = c_twiddle_256; } else if constexpr (NZ == 512) { - twiddles = c_twiddle_fwd_512; + twiddles = c_twiddle_512; } else { static_assert(NZ == 16 || NZ == 64 || NZ == 128 || NZ == 256 || NZ == 512, "Unsupported NZ"); @@ -340,18 +337,18 @@ fft_block_cooperative(const BoutReal** __restrict__ in, BoutReal** __restrict__ } // ===== APPLY PHASE SHIFT ===== - for (int k = tid; k < nmodes; k += threads_per_fft) { - const double2 ph = phs[batch * nmodes + k]; + for (int k = tid; k < NMODES; k += threads_per_fft) { + const double2 ph = phs[batch * NMODES + k]; const double real = shared_fft[fft_id_in_block][k].x; const double imag = shared_fft[fft_id_in_block][k].y; 
shared_fft[fft_id_in_block][k].x = real * ph.x - imag * ph.y; shared_fft[fft_id_in_block][k].y = real * ph.y + imag * ph.x; } - for (int k = tid + nmodes; k < NZ; k += threads_per_fft) { - if (k >= nmodes) { + for (int k = tid + NMODES; k < NZ; k += threads_per_fft) { + if (k >= NMODES) { const int kk = NZ - k; - const double2 tmp = phs[batch * nmodes + kk]; + const double2 tmp = phs[batch * NMODES + kk]; const double real = shared_fft[fft_id_in_block][k].x; const double imag = shared_fft[fft_id_in_block][k].y; shared_fft[fft_id_in_block][k].x = real * tmp.x + imag * tmp.y; @@ -418,18 +415,15 @@ fft_block_cooperative(const BoutReal** __restrict__ in, BoutReal** __restrict__ } // Launcher for block-level cooperative FFT -static void shiftZ_block_fft(Mesh& mesh, const BoutReal** in, BoutReal** out, - const double2** phs, int nblocks, int batches, +static void shiftZ_block_fft(const int Nz, const BoutReal** in, BoutReal** out, + const double2** phs, int nblocks, int nbatches, cudaStream_t stream = 0) { - int Nz = mesh.LocalNz; - int nmodes = Nz / 2 + 1; - if ((Nz & (Nz - 1)) != 0) { fprintf(stderr, "Error: Nz=%d must be power of 2\n", Nz); return; } - const int total_ffts = nblocks * batches; + const int total_ffts = nblocks * nbatches; if (Nz == 16) { constexpr int FFTS_PER_BLOCK = 16; @@ -439,7 +433,7 @@ static void shiftZ_block_fft(Mesh& mesh, const BoutReal** in, BoutReal** out, dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); fft_block_cooperative<16, FFTS_PER_BLOCK> - <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + <<>>(in, out, phs, nbatches, nblocks); } else if (Nz == 64) { constexpr int FFTS_PER_BLOCK = 4; constexpr int THREADS_PER_FFT = 64; // Use 64 threads per FFT @@ -448,7 +442,7 @@ static void shiftZ_block_fft(Mesh& mesh, const BoutReal** in, BoutReal** out, dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); fft_block_cooperative<64, FFTS_PER_BLOCK> - <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + <<>>(in, out, phs, 
nbatches, nblocks); } else if (Nz == 128) { constexpr int FFTS_PER_BLOCK = 2; @@ -458,7 +452,7 @@ static void shiftZ_block_fft(Mesh& mesh, const BoutReal** in, BoutReal** out, dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); fft_block_cooperative<128, FFTS_PER_BLOCK> - <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + <<>>(in, out, phs, nbatches, nblocks); } else if (Nz == 256) { constexpr int FFTS_PER_BLOCK = 1; @@ -468,7 +462,7 @@ static void shiftZ_block_fft(Mesh& mesh, const BoutReal** in, BoutReal** out, dim3 grid(total_ffts); fft_block_cooperative<256, FFTS_PER_BLOCK> - <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + <<>>(in, out, phs, nbatches, nblocks); } else if (Nz == 512) { constexpr int FFTS_PER_BLOCK = 1; @@ -478,10 +472,9 @@ static void shiftZ_block_fft(Mesh& mesh, const BoutReal** in, BoutReal** out, dim3 grid(total_ffts); fft_block_cooperative<512, FFTS_PER_BLOCK> - <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + <<>>(in, out, phs, nbatches, nblocks); } else { - fprintf(stderr, "Unsupported Nz=%d for block FFT\n", Nz); - throw std::runtime_error("Unsupported Nz for block FFT"); + throw std::runtime_error("Unsupported Nz " + std::to_string(Nz) + " for block FFT"); } cudaError_t err = cudaGetLastError(); @@ -516,16 +509,30 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { f_slice.allocate(); #if BOUT_HAS_CUDA + static struct StreamRAII { + cudaStream_t stream = 0; + StreamRAII() { + if (cudaStreamCreate(&stream) != cudaSuccess) { + throw BoutException("Failed to create CUDA stream"); + } + } + + cudaStream_t get() const { return stream; } + + void synchronize() const { cudaStreamSynchronize(stream); } + + ~StreamRAII() { cudaStreamDestroy(stream); } + } stream; size_t block_idx = 0; - int num_batches = + int nbatches = region.getBlocks().cbegin()->second.ind - region.getBlocks().cbegin()->first.ind; for (auto block = region.getBlocks().cbegin(), end = region.getBlocks().cend(); block < end; ++block) { auto idx_s = 
block->first; auto idx_e = block->second; - int inner_batches = idx_e.ind - idx_s.ind; - if (inner_batches != num_batches) { + int inner_nbatches = idx_e.ind - idx_s.ind; + if (inner_nbatches != nbatches) { throw BoutException( "Non-uniform number of batches in ShiftedMetric::calcParallelSlices"); } @@ -540,12 +547,11 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { block_idx++; } - shiftZ_block_fft(mesh, &blocks_in[0], &blocks_out[0], &phs_in[0], nblocks, - num_batches, 0); + shiftZ_block_fft(mesh.LocalNz, &blocks_in[0], &blocks_out[0], &phs_in[0], nblocks, + nbatches, stream.get()); - cudaDeviceSynchronize(); + stream.synchronize(); #else - // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_NOY")) { const int ix = i.x(); const int iy = i.y(); @@ -553,8 +559,6 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { shiftZ(&(f(ix, iy_offset, 0)), &(phase.phase_shift(ix, iy, 0)), &(f_slice(ix, iy_offset, 0))); } - //std::cout << "ShiftedMetric::shiftZ " << __FILE__ << " :" << __LINE__ - // << " count = " << count << " each size " << mesh.LocalNz << "\n"; #endif } } @@ -572,7 +576,6 @@ ShiftedMetric::shiftZ(const Field3D& f, Matrix> f_fft(mesh.LocalNx, mesh.LocalNy); f_fft = Array(nmodes); - // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_ALL")) { int ix = i.x(); int iy = i.y(); @@ -587,7 +590,6 @@ ShiftedMetric::shiftZ(const Field3D& f, current_result.allocate(); current_result.setLocation(f.getLocation()); - // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_NOY")) { // Deep copy the FFT'd field int ix = i.x(); From b6c738c449f9ff32d853687c89169a995bc3a4a2 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 2025 10:25:31 -0700 Subject: [PATCH 25/58] Fixup: remove comments, avoid temp for inverse --- src/mesh/parallel/shiftedmetric.cxx | 36 
+++++++++++++---------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/mesh/parallel/shiftedmetric.cxx b/src/mesh/parallel/shiftedmetric.cxx index 1b92313117..8a71e082d6 100644 --- a/src/mesh/parallel/shiftedmetric.cxx +++ b/src/mesh/parallel/shiftedmetric.cxx @@ -227,7 +227,6 @@ void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* ou irfft(&cmplx[0], mesh.LocalNz, out); // Reverse FFT } -/* NEW CODE */ // Bit-reversal __device__ inline unsigned int bit_reverse(unsigned int x, unsigned int log2n) { unsigned int result = 0; @@ -364,16 +363,15 @@ __global__ void fft_block_cooperative(const BoutReal** __restrict__ in, } __syncthreads(); - // Bit-reverse for inverse - __shared__ double2 temp_fft[FFTS_PER_BLOCK][NZ]; - for (int i = tid; i < NZ; i += threads_per_fft) { + // Bit-reverse with standard swap to avoid temp array + // This is tricky but saves memory + for (int i = tid; i < NZ / 2; i += threads_per_fft) { const unsigned int rev_i = bit_reverse(i, LOG2_NZ); - temp_fft[fft_id_in_block][rev_i] = shared_fft[fft_id_in_block][i]; - } - __syncthreads(); - - for (int i = tid; i < NZ; i += threads_per_fft) { - shared_fft[fft_id_in_block][i] = temp_fft[fft_id_in_block][i]; + if (i < rev_i) { // Only swap once per pair + double2 temp = shared_fft[fft_id_in_block][i]; + shared_fft[fft_id_in_block][i] = shared_fft[fft_id_in_block][rev_i]; + shared_fft[fft_id_in_block][rev_i] = temp; + } } __syncthreads(); @@ -427,18 +425,18 @@ static void shiftZ_block_fft(const int Nz, const BoutReal** in, BoutReal** out, if (Nz == 16) { constexpr int FFTS_PER_BLOCK = 16; - constexpr int THREADS_PER_FFT = 16; // Use 64 threads per FFT + constexpr int THREADS_PER_FFT = 16; - dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 16 x 16 = 256 threads + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); fft_block_cooperative<16, FFTS_PER_BLOCK> <<>>(in, out, phs, nbatches, nblocks); } else if 
(Nz == 64) { constexpr int FFTS_PER_BLOCK = 4; - constexpr int THREADS_PER_FFT = 64; // Use 64 threads per FFT + constexpr int THREADS_PER_FFT = 64; - dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 64 x 4 = 256 threads + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); fft_block_cooperative<64, FFTS_PER_BLOCK> @@ -448,7 +446,7 @@ static void shiftZ_block_fft(const int Nz, const BoutReal** in, BoutReal** out, constexpr int FFTS_PER_BLOCK = 2; constexpr int THREADS_PER_FFT = 128; - dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 128 x 2 = 256 threads + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); fft_block_cooperative<128, FFTS_PER_BLOCK> @@ -458,7 +456,7 @@ static void shiftZ_block_fft(const int Nz, const BoutReal** in, BoutReal** out, constexpr int FFTS_PER_BLOCK = 1; constexpr int THREADS_PER_FFT = 256; - dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 256 x 1 = 256 threads + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); dim3 grid(total_ffts); fft_block_cooperative<256, FFTS_PER_BLOCK> @@ -466,9 +464,9 @@ static void shiftZ_block_fft(const int Nz, const BoutReal** in, BoutReal** out, } else if (Nz == 512) { constexpr int FFTS_PER_BLOCK = 1; - constexpr int THREADS_PER_FFT = 512; // 512 threads per FFT + constexpr int THREADS_PER_FFT = 512; - dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 512 x 1 = 512 threads + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); dim3 grid(total_ffts); fft_block_cooperative<512, FFTS_PER_BLOCK> @@ -483,8 +481,6 @@ static void shiftZ_block_fft(const int Nz, const BoutReal** in, BoutReal** out, } } -/* END NEWER CODE */ - void ShiftedMetric::calcParallelSlices(Field3D& f) { if (f.getDirectionY() == YDirectionType::Aligned) { // Cannot calculate parallel slices for field-aligned fields, so return without From 11ebfcd4e9874c7b181f2406e37227dbf4038fd2 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 
2025 10:43:12 -0700 Subject: [PATCH 26/58] Fixup: preprocessor guards, better variable naming --- src/mesh/parallel/shiftedmetric.cxx | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/mesh/parallel/shiftedmetric.cxx b/src/mesh/parallel/shiftedmetric.cxx index 8a71e082d6..c05550e348 100644 --- a/src/mesh/parallel/shiftedmetric.cxx +++ b/src/mesh/parallel/shiftedmetric.cxx @@ -227,6 +227,7 @@ void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* ou irfft(&cmplx[0], mesh.LocalNz, out); // Reverse FFT } +#if BOUT_HAS_CUDA // Bit-reversal __device__ inline unsigned int bit_reverse(unsigned int x, unsigned int log2n) { unsigned int result = 0; @@ -244,7 +245,7 @@ template __global__ void fft_block_cooperative(const BoutReal** __restrict__ in, BoutReal** __restrict__ out, const double2** __restrict__ blocks_phs, - const int batches, const int nblocks) { + const int nbatches, const int nblocks) { constexpr int LOG2_NZ = __builtin_ctz(NZ); constexpr double INV_NZ = 1.0 / (double)NZ; @@ -276,11 +277,11 @@ __global__ void fft_block_cooperative(const BoutReal** __restrict__ in, threadIdx.y; // Which FFT this thread works on (0 to FFTS_PER_BLOCK-1) const int global_fft_id = blockIdx.x * FFTS_PER_BLOCK + fft_id_in_block; - if (global_fft_id >= nblocks * batches) + if (global_fft_id >= nblocks * nbatches) return; - const int block = global_fft_id / batches; - const int batch = global_fft_id % batches; + const int block = global_fft_id / nbatches; + const int batch = global_fft_id % nbatches; const double* __restrict__ in_line = in[block] + batch * NZ; double* __restrict__ out_line = out[block] + batch * NZ; @@ -480,6 +481,7 @@ static void shiftZ_block_fft(const int Nz, const BoutReal** in, BoutReal** out, throw std::runtime_error(std::string("Block FFT failed: ") + cudaGetErrorString(err)); } } +#endif void ShiftedMetric::calcParallelSlices(Field3D& f) { if (f.getDirectionY() == YDirectionType::Aligned) { @@ -490,8 +492,8 
@@ void ShiftedMetric::calcParallelSlices(Field3D& f) { f.splitParallelSlices(); +#if BOUT_HAS_CUDA auto& region = mesh.getRegion2D("RGN_NOY"); - static size_t nblocks = region.getBlocks().size(); if (nblocks != region.getBlocks().size()) { throw BoutException("Number of blocks changed in ShiftedMetric::calcParallelSlices"); @@ -499,6 +501,7 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { static Array blocks_in(nblocks); static Array blocks_out(nblocks); static Array phs_in(nblocks); +#endif for (const auto& phase : parallel_slice_phases) { auto& f_slice = f.ynext(phase.y_offset); From cf5a88291036bc20a5922a5fb1563dc8221623f5 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 2025 11:15:49 -0700 Subject: [PATCH 27/58] Fixup: saner split with BOUT_HAS_CUDA --- src/mesh/parallel/shiftedmetric.cxx | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/mesh/parallel/shiftedmetric.cxx b/src/mesh/parallel/shiftedmetric.cxx index c05550e348..981034c1ee 100644 --- a/src/mesh/parallel/shiftedmetric.cxx +++ b/src/mesh/parallel/shiftedmetric.cxx @@ -501,13 +501,11 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { static Array blocks_in(nblocks); static Array blocks_out(nblocks); static Array phs_in(nblocks); -#endif for (const auto& phase : parallel_slice_phases) { auto& f_slice = f.ynext(phase.y_offset); f_slice.allocate(); -#if BOUT_HAS_CUDA static struct StreamRAII { cudaStream_t stream = 0; StreamRAII() { @@ -550,7 +548,12 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { nbatches, stream.get()); stream.synchronize(); + } #else + for (const auto& phase : parallel_slice_phases) { + auto& f_slice = f.ynext(phase.y_offset); + f_slice.allocate(); + BOUT_FOR(i, mesh.getRegion2D("RGN_NOY")) { const int ix = i.x(); const int iy = i.y(); @@ -558,8 +561,8 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { shiftZ(&(f(ix, iy_offset, 0)), &(phase.phase_shift(ix, iy, 0)), &(f_slice(ix, iy_offset, 0))); } -#endif } 
+#endif } std::vector From 9f42dcbf7ffadecde2ab51c1dc308549910d1b61 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 2025 18:28:25 -0700 Subject: [PATCH 28/58] Fixup: remove redundant conditional --- src/mesh/parallel/shiftedmetric.cxx | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/mesh/parallel/shiftedmetric.cxx b/src/mesh/parallel/shiftedmetric.cxx index 981034c1ee..45f2988561 100644 --- a/src/mesh/parallel/shiftedmetric.cxx +++ b/src/mesh/parallel/shiftedmetric.cxx @@ -346,14 +346,12 @@ __global__ void fft_block_cooperative(const BoutReal** __restrict__ in, } for (int k = tid + NMODES; k < NZ; k += threads_per_fft) { - if (k >= NMODES) { - const int kk = NZ - k; - const double2 tmp = phs[batch * NMODES + kk]; - const double real = shared_fft[fft_id_in_block][k].x; - const double imag = shared_fft[fft_id_in_block][k].y; - shared_fft[fft_id_in_block][k].x = real * tmp.x + imag * tmp.y; - shared_fft[fft_id_in_block][k].y = -real * tmp.y + imag * tmp.x; - } + const int kk = NZ - k; + const double2 tmp = phs[batch * NMODES + kk]; + const double real = shared_fft[fft_id_in_block][k].x; + const double imag = shared_fft[fft_id_in_block][k].y; + shared_fft[fft_id_in_block][k].x = real * tmp.x + imag * tmp.y; + shared_fft[fft_id_in_block][k].y = -real * tmp.y + imag * tmp.x; } __syncthreads(); From 35e6f423859c4f6b5554f005e31576f18cc23c56 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Mon, 24 Nov 2025 12:31:35 -0800 Subject: [PATCH 29/58] Use streams to reduce synchronization overhead --- include/bout/bout_types.hxx | 7 ++-- include/bout/field2d.hxx | 6 +-- include/bout/field3d.hxx | 40 ++++++++++---------- include/bout/fieldops.hxx | 57 +++++++++++++++++++++++++---- include/bout/rajalib.hxx | 14 ------- src/mesh/parallel/shiftedmetric.cxx | 46 +++++++++++++---------- 6 files changed, 103 insertions(+), 67 deletions(-) diff --git a/include/bout/bout_types.hxx b/include/bout/bout_types.hxx index 
c725c281d3..03bc4dcee4 100644 --- a/include/bout/bout_types.hxx +++ b/include/bout/bout_types.hxx @@ -2,7 +2,7 @@ * Copyright 2010 B.D.Dudson, S.Farley, M.V.Umansky, X.Q.Xu * * Contact Ben Dudson, bd512@york.ac.uk - * + * * This file is part of BOUT++. * * BOUT++ is free software: you can redistribute it and/or modify @@ -145,8 +145,9 @@ struct Constant { T val; struct View { T v; - View(T v) : v(v) {} - __host__ __device__ T operator()(int) const { return v; } + cudaStream_t stream = 0; + View(T v) : v(v) {} + __host__ __device__ T operator()(int) const { return v; } }; operator View() const { return {val}; } }; diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index da8de551ad..b452df7fd6 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -273,10 +273,10 @@ public: #define FIELD2D_OP_EQUALS(OP_SYM) \ template \ std::enable_if_t || is_expr_constant_v, Field2D&> \ - operator OP_SYM##=(R rhs) { \ + operator OP_SYM## = (R rhs) { \ if (data.unique()) { \ - auto BE = (*this)OP_SYM rhs; \ - BE.evaluate(&data[0]); \ + auto expr = (*this)OP_SYM rhs; \ + expr.evaluate(&data[0]); \ } else { \ (*this) = (*this)OP_SYM rhs; \ } \ diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 62b299bc48..9dc064d6e6 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -2,7 +2,7 @@ * Copyright 2010 B.D.Dudson, S.Farley, M.V.Umansky, X.Q.Xu * * Contact: Ben Dudson, bd512@york.ac.uk - * + * * This file is part of BOUT++. 
* * BOUT++ is free software: you can redistribute it and/or modify @@ -355,7 +355,7 @@ public: * Direct access to the underlying data array * * If CHECK > 2 then bounds checking is performed - * + * * If CHECK <= 2 then no checks are performed, to * allow inlining and optimisation of inner loops */ @@ -473,19 +473,19 @@ public: ///@} -#define FIELD3D_OP_EQUALS(OP_SYM) \ - template \ - std::enable_if_t || is_expr_field2d_v \ - || is_expr_constant_v, \ - Field3D&> operator OP_SYM##=(const R & rhs) { \ - if (data.unique()) { \ - clearParallelSlices(); \ - auto Expr = (*this)OP_SYM rhs; \ - Expr.evaluate(&data[0]); \ - } else { \ - (*this) = (*this)OP_SYM rhs; \ - } \ - return *this; \ +#define FIELD3D_OP_EQUALS(OP_SYM) \ + template \ + std::enable_if_t< \ + is_expr_field3d_v || is_expr_field2d_v || is_expr_constant_v, Field3D&> \ + operator OP_SYM## = (const R& rhs) { \ + if (data.unique()) { \ + clearParallelSlices(); \ + auto expr = (*this)OP_SYM rhs; \ + expr.evaluate(&data[0]); \ + } else { \ + (*this) = (*this)OP_SYM rhs; \ + } \ + return *this; \ } FIELD3D_OP_EQUALS(+) @@ -565,8 +565,8 @@ FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); #define FIELD3D_FIELD3D_FIELD3D_OP(OP_SYM, OP_TYPE) \ template && is_expr_field3d_v>> \ - BinaryExpr operator OP_SYM(const L & lhs, \ - const R & rhs) { \ + BinaryExpr operator OP_SYM(const L& lhs, \ + const R& rhs) { \ auto regionID = \ lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); \ return BinaryExpr{ \ @@ -590,7 +590,7 @@ FIELD3D_FIELD3D_FIELD3D_OP(/, Div) template \ std::enable_if_t && is_expr_field2d_v, \ BinaryExpr> \ - operator OP_SYM(const L & lhs, const R & rhs) { \ + operator OP_SYM(const L& lhs, const R& rhs) { \ auto regionID = lhs.getRegionID(); \ int mesh_nz = lhs.getMesh()->LocalNz; \ return BinaryExpr{ \ @@ -613,7 +613,7 @@ FIELD3D_FIELD3D_FIELD2D_OP(/, Div) template \ std::enable_if_t && is_expr_constant_v, \ BinaryExpr, bout::op::OP_TYPE>> \ - operator OP_SYM(const L & lhs, 
R rhs) { \ + operator OP_SYM(const L& lhs, R rhs) { \ auto regionID = lhs.getRegionID(); \ return BinaryExpr, bout::op::OP_TYPE>{ \ static_cast(lhs), \ @@ -635,7 +635,7 @@ FIELD3D_FIELD3D_BOUTREAL_OP(/, Div) template \ std::enable_if_t && is_expr_field3d_v, \ BinaryExpr, R, bout::op::OP_TYPE>> \ - operator OP_SYM(const L & lhs, const R & rhs) { \ + operator OP_SYM(const L& lhs, const R& rhs) { \ auto regionID = rhs.getRegionID(); \ return BinaryExpr, R, bout::op::OP_TYPE>{ \ static_cast::View>(lhs), \ diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 48d104e3ea..114793ebc8 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -130,6 +130,40 @@ __global__ void __launch_bounds__(THREADS) evaluatorExpr(BoutReal* out, const Ex inline std::unordered_map> regionIndicesCache; +struct StreamsRAII { + std::vector streams; + + cudaStream_t get() { + cudaStream_t stream = 0; + + if (streams.empty()) { + if (cudaStreamCreate(&stream) != cudaSuccess) { + throw BoutException("Failed to create CUDA stream"); + } + } else { + stream = streams.back(); + streams.pop_back(); + } + + return stream; + } + + void put(cudaStream_t stream) { streams.push_back(stream); } + + ~StreamsRAII() { + for (auto& stream : streams) { + cudaStreamDestroy(stream); + } + } + + StreamsRAII() = default; + StreamsRAII(const StreamsRAII&) = delete; + StreamsRAII(StreamsRAII&&) = delete; + StreamsRAII& operator=(const StreamsRAII&) = delete; + StreamsRAII& operator=(StreamsRAII&&) = delete; +}; +inline struct StreamsRAII streams; + template struct BinaryExpr { typename L::View lhs; @@ -220,16 +254,23 @@ struct BinaryExpr { operator View() const { return View{lhs, rhs, &indices[0], indices.size(), f}; } void evaluate(BoutReal* data) const { +#if 1 + cudaStream_t stream = streams.get(); int blocks = (size() + THREADS - 1) / THREADS; - evaluatorExpr<<>>(&data[0], static_cast(*this)); - cudaDeviceSynchronize(); + evaluatorExpr<<>>(&data[0], static_cast(*this)); + 
cudaStreamSynchronize(stream); + streams.put(stream); +#endif + +#if 0 // OpenMP impl. - //int e = size(); + int e = size(); //#pragma omp parallel for - //for (int i = 0; i < e; ++i) { - // int idx = regionIdx(i); - // data[idx] = operator()(idx); // single‐pass fusion - //} + for (int i = 0; i < e; ++i) { + int idx = regionIdx(i); + data[idx] = operator()(idx); // single‐pass fusion + } +#endif } Mesh* getMesh() const { return mesh; } @@ -238,4 +279,4 @@ struct BinaryExpr { std::optional getRegionID() const { return regionID; }; }; -#endif // BOUT_EXPRESSION_HX \ No newline at end of file +#endif // BOUT_FIELDSOPS_HXX diff --git a/include/bout/rajalib.hxx b/include/bout/rajalib.hxx index d61a58e0d8..20929304b5 100644 --- a/include/bout/rajalib.hxx +++ b/include/bout/rajalib.hxx @@ -139,20 +139,6 @@ private: #define BOUT_FOR_RAJA(index, region, ...) \ RajaForAll(region) << [ =, ##__VA_ARGS__ ] RAJA_DEVICE(int index) mutable -// NEW STUFF - -template -__global__ void evaluator(BoutReal *out, Expr &expr) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = tid; i < expr.size(); i += stride) { - out[expr.regionIdx(i)] = expr(expr.regionIdx(i)); // single‐pass fusion - } -} - -// END OF NEW STUFF - - #else // BOUT_HAS_RAJA #warning RAJA not enabled. BOUT_FOR_RAJA falling back to BOUT_FOR. 
diff --git a/src/mesh/parallel/shiftedmetric.cxx b/src/mesh/parallel/shiftedmetric.cxx index 45f2988561..40085eaf92 100644 --- a/src/mesh/parallel/shiftedmetric.cxx +++ b/src/mesh/parallel/shiftedmetric.cxx @@ -496,32 +496,39 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { if (nblocks != region.getBlocks().size()) { throw BoutException("Number of blocks changed in ShiftedMetric::calcParallelSlices"); } - static Array blocks_in(nblocks); - static Array blocks_out(nblocks); - static Array phs_in(nblocks); - for (const auto& phase : parallel_slice_phases) { - auto& f_slice = f.ynext(phase.y_offset); - f_slice.allocate(); - - static struct StreamRAII { - cudaStream_t stream = 0; - StreamRAII() { - if (cudaStreamCreate(&stream) != cudaSuccess) { - throw BoutException("Failed to create CUDA stream"); - } + static struct StreamRAII { + cudaStream_t stream = 0; + StreamRAII() { + if (cudaStreamCreate(&stream) != cudaSuccess) { + throw BoutException("Failed to create CUDA stream"); } + } + + cudaStream_t get() const { return stream; } + + void synchronize() const { cudaStreamSynchronize(stream); } - cudaStream_t get() const { return stream; } + ~StreamRAII() { cudaStreamDestroy(stream); } + } stream; - void synchronize() const { cudaStreamSynchronize(stream); } + // Vector of Arrays for each phase. 
+ std::vector> blocks_in_phase; + std::vector> blocks_out_phase; + std::vector> phs_in_phase; + + for (const auto& phase : parallel_slice_phases) { + auto& f_slice = f.ynext(phase.y_offset); + f_slice.allocate(); - ~StreamRAII() { cudaStreamDestroy(stream); } - } stream; size_t block_idx = 0; int nbatches = region.getBlocks().cbegin()->second.ind - region.getBlocks().cbegin()->first.ind; + Array& blocks_in = blocks_in_phase.emplace_back(nblocks); + Array& blocks_out = blocks_out_phase.emplace_back(nblocks); + Array& phs_in = phs_in_phase.emplace_back(nblocks); + for (auto block = region.getBlocks().cbegin(), end = region.getBlocks().cend(); block < end; ++block) { auto idx_s = block->first; @@ -544,9 +551,10 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { shiftZ_block_fft(mesh.LocalNz, &blocks_in[0], &blocks_out[0], &phs_in[0], nblocks, nbatches, stream.get()); - - stream.synchronize(); } + + // Synchronize to ensure all shifts are complete. + stream.synchronize(); #else for (const auto& phase : parallel_slice_phases) { auto& f_slice = f.ynext(phase.y_offset); From f38dbd145dd28c7bf1319e0ec2e2130626594f09 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18 Jun 2026 11:30:12 -0700 Subject: [PATCH 30/58] Fix non-CUDA builds Use compatibility wrappers and include guards so that BOUT++ compiles without CUDA. 
--- include/bout/bout_types.hxx | 5 +- include/bout/build_config.hxx | 2 + include/bout/field.hxx | 4 +- include/bout/field2d.hxx | 8 +-- include/bout/field3d.hxx | 4 +- include/bout/field_accessor.hxx | 5 +- include/bout/fieldops.hxx | 97 +++++++++++++++++---------------- include/bout/fieldperp.hxx | 4 +- 8 files changed, 66 insertions(+), 63 deletions(-) diff --git a/include/bout/bout_types.hxx b/include/bout/bout_types.hxx index db0698d83c..7747b937b3 100644 --- a/include/bout/bout_types.hxx +++ b/include/bout/bout_types.hxx @@ -22,6 +22,8 @@ #ifndef BOUT_TYPES_H #define BOUT_TYPES_H +#include "bout/build_config.hxx" + #include #include @@ -145,9 +147,8 @@ struct Constant { T val; struct View { T v; - cudaStream_t stream = 0; View(T v) : v(v) {} - __host__ __device__ T operator()(int) const { return v; } + BOUT_HOST_DEVICE T operator()(int) const { return v; } }; operator View() const { return {val}; } }; diff --git a/include/bout/build_config.hxx b/include/bout/build_config.hxx index c10cd07746..60c55abad7 100644 --- a/include/bout/build_config.hxx +++ b/include/bout/build_config.hxx @@ -52,10 +52,12 @@ constexpr auto use_msgstack = static_cast(BOUT_USE_MSGSTACK); #define BOUT_HOST_DEVICE __host__ __device__ #define BOUT_HOST __host__ #define BOUT_DEVICE __device__ +#define BOUT_FORCEINLINE __forceinline__ #else #define BOUT_HOST_DEVICE #define BOUT_HOST #define BOUT_DEVICE +#define BOUT_FORCEINLINE inline #endif #endif // BOUT_BUILD_OPTIONS_HXX diff --git a/include/bout/field.hxx b/include/bout/field.hxx index cd1556c96b..c7f1d23fdd 100644 --- a/include/bout/field.hxx +++ b/include/bout/field.hxx @@ -547,8 +547,8 @@ class Field3DParallel; namespace bout::op { \ struct name { \ template \ - __host__ __device__ BoutReal operator()(int idx, const LView& L, \ - const RView& R) const { \ + BOUT_HOST_DEVICE BoutReal operator()(int idx, const LView& L, \ + const RView& R) const { \ return func(L(idx)); \ } \ }; \ diff --git a/include/bout/field2d.hxx 
b/include/bout/field2d.hxx index 7d79cd56d6..f3fd6ce4df 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -324,10 +324,10 @@ public: BoutReal* data; int mul = 1; int div = 1; - __host__ __device__ inline BoutReal operator()(int idx) const { + BOUT_HOST_DEVICE inline BoutReal operator()(int idx) const { return data[(idx * mul / div)]; } - __host__ __device__ inline BoutReal& operator[](int idx) const { + BOUT_HOST_DEVICE inline BoutReal& operator[](int idx) const { return data[(idx * mul) / div]; } @@ -340,8 +340,8 @@ public: operator View() { return View{&data[0]}; } operator View() const { return View{const_cast(&data[0])}; } - __device__ inline BoutReal operator()(int i) { return View()(i); } - __device__ inline BoutReal operator()(int i) const { return View()(i); } + BOUT_DEVICE inline BoutReal operator()(int i) { return View()(i); } + BOUT_DEVICE inline BoutReal operator()(int i) const { return View()(i); } private: /// Internal data array. Handles allocation/freeing of memory diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 972322ec8b..e4c7b69ddc 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -460,10 +460,10 @@ public: BoutReal* data; int mul = 1; int div = 1; - __host__ __device__ inline BoutReal operator()(int idx) const { + BOUT_HOST_DEVICE inline BoutReal operator()(int idx) const { return data[(idx * mul) / div]; } - __host__ __device__ inline BoutReal& operator[](int idx) const { + BOUT_HOST_DEVICE inline BoutReal& operator[](int idx) const { return data[(idx * mul) / div]; } diff --git a/include/bout/field_accessor.hxx b/include/bout/field_accessor.hxx index a43420d6b3..7c0ba2eb7c 100644 --- a/include/bout/field_accessor.hxx +++ b/include/bout/field_accessor.hxx @@ -63,8 +63,7 @@ struct FieldAccessor { if (auto* Coords = f.getCoordinates()) { coords = CoordinatesAccessor{Coords}; - } - else { + } else { coords = CoordinatesAccessor{}; } @@ -96,7 +95,7 @@ struct FieldAccessor { /// 
BOUT_HOST_DEVICE inline const BoutReal& operator[](int ind) const { return data[ind]; } BOUT_HOST_DEVICE inline BoutReal& operator[](int ind) { return data[ind]; } - __device__ inline BoutReal operator()(int i) const { return data[i]; } + BOUT_DEVICE inline BoutReal operator()(int i) const { return data[i]; } BOUT_HOST_DEVICE inline const BoutReal& operator[](const Ind3D& ind) const { return data[ind.ind]; diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 114793ebc8..26e3b82cc5 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -5,10 +5,14 @@ #include "bout/array.hxx" #include "bout/bout_types.hxx" -#include #include #include #include +#include + +#if BOUT_HAS_CUDA +#include +#endif class Mesh; class Field3D; @@ -51,57 +55,55 @@ struct Assign { int scale = 1; int offset = 0; template - __host__ __device__ void operator()(int idx, BoutReal* out, const Expr& expr) const { + BOUT_HOST_DEVICE void operator()(int idx, BoutReal* out, const Expr& expr) const { out[(idx * scale) + offset] = expr.lhs(idx) + expr.rhs(idx); } }; struct Add { template - __host__ __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, - const RView& R) const { + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal operator()(int idx, const LView& L, + const RView& R) const { return L(idx) + R(idx); } - __host__ __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal operator()(BoutReal a, BoutReal b) const { return a + b; } }; - struct Sub { - template - __host__ __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, - const RView& R) const { - return L(idx) - R(idx); - } - __host__ __device__ __forceinline__ BoutReal operator()(BoutReal a, - BoutReal b) const { - return a - b; - } - }; - struct Mul { - template - __host__ __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, - const RView& R) const { - return L(idx) * R(idx); - } - __host__ 
__device__ __forceinline__ BoutReal operator()(BoutReal a, - BoutReal b) const { - return a * b; - } - }; - struct Div { - template - __host__ __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, - const RView& R) const { - return L(idx) / R(idx); - } - __host__ __device__ __forceinline__ BoutReal operator()(BoutReal a, - BoutReal b) const { - return a / b; - } - }; +struct Sub { + template + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal operator()(int idx, const LView& L, + const RView& R) const { + return L(idx) - R(idx); + } + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal operator()(BoutReal a, BoutReal b) const { + return a - b; + } }; +struct Mul { + template + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal operator()(int idx, const LView& L, + const RView& R) const { + return L(idx) * R(idx); + } + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal operator()(BoutReal a, BoutReal b) const { + return a * b; + } }; +struct Div { + template + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal operator()(int idx, const LView& L, + const RView& R) const { + return L(idx) / R(idx); + } + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal operator()(BoutReal a, BoutReal b) const { + return a / b; + } +}; +}; // namespace op +}; // namespace bout +#if BOUT_HAS_CUDA && defined(__CUDACC__) template __global__ void __launch_bounds__(THREADS) evaluatorExpr(BoutReal* out, const Expr expr) { int tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -127,9 +129,11 @@ __global__ void __launch_bounds__(THREADS) evaluatorExpr(BoutReal* out, const Ex // out[idx] = expr(idx); // single‐pass fusion //} } +#endif inline std::unordered_map> regionIndicesCache; +#if BOUT_HAS_CUDA && defined(__CUDACC__) struct StreamsRAII { std::vector streams; @@ -163,6 +167,7 @@ struct StreamsRAII { StreamsRAII& operator=(StreamsRAII&&) = delete; }; inline struct StreamsRAII streams; +#endif template struct BinaryExpr { @@ -216,7 +221,7 @@ struct BinaryExpr { //} } - BinaryExpr& operator=(BinaryExpr const&) = delete; + 
BinaryExpr& operator=(const BinaryExpr&) = delete; BinaryExpr& operator=(BinaryExpr&&) = delete; inline int size() const { return indices.size(); } @@ -240,11 +245,11 @@ struct BinaryExpr { this->div = div; return *this; } - __host__ __device__ __forceinline__ int size() const { return num_indices; } - __host__ __device__ __forceinline__ int regionIdx(int idx) const { + BOUT_HOST_DEVICE BOUT_FORCEINLINE int size() const { return num_indices; } + BOUT_HOST_DEVICE BOUT_FORCEINLINE int regionIdx(int idx) const { return indices[idx]; } - __host__ __device__ __forceinline__ BoutReal operator()(int idx) const { + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal operator()(int idx) const { return f((idx * mul) / div, lhs, rhs); // single‐pass fusion //return f(lhs((idx * mul) / div), rhs((idx * mul) / div)); // single‐pass fusion } @@ -254,18 +259,14 @@ struct BinaryExpr { operator View() const { return View{lhs, rhs, &indices[0], indices.size(), f}; } void evaluate(BoutReal* data) const { -#if 1 +#if BOUT_HAS_CUDA && defined(__CUDACC__) cudaStream_t stream = streams.get(); int blocks = (size() + THREADS - 1) / THREADS; evaluatorExpr<<>>(&data[0], static_cast(*this)); cudaStreamSynchronize(stream); streams.put(stream); -#endif - -#if 0 - // OpenMP impl. 
+#else int e = size(); - //#pragma omp parallel for for (int i = 0; i < e; ++i) { int idx = regionIdx(i); data[idx] = operator()(idx); // single‐pass fusion diff --git a/include/bout/fieldperp.hxx b/include/bout/fieldperp.hxx index 98da0e4a94..b00ec1dcb4 100644 --- a/include/bout/fieldperp.hxx +++ b/include/bout/fieldperp.hxx @@ -324,10 +324,10 @@ public: BoutReal* data; int mul = 1; int div = 1; - __host__ __device__ inline BoutReal operator()(int idx) const { + BOUT_HOST_DEVICE inline BoutReal operator()(int idx) const { return data[(idx * mul) / div]; } - __host__ __device__ inline BoutReal& operator[](int idx) const { + BOUT_HOST_DEVICE inline BoutReal& operator[](int idx) const { return data[(idx * mul) / div]; } From 98b9e5508d33fe0e5eea6105131162e99841ff7e Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18 Jun 2026 11:58:51 -0700 Subject: [PATCH 31/58] More fixes, remove duplications --- include/bout/field.hxx | 80 +++++++++--------- include/bout/field2d.hxx | 2 - include/bout/field3d.hxx | 167 +++---------------------------------- include/bout/fieldperp.hxx | 5 +- include/bout/vector3d.hxx | 6 +- 5 files changed, 55 insertions(+), 205 deletions(-) diff --git a/include/bout/field.hxx b/include/bout/field.hxx index c7f1d23fdd..e707b62816 100644 --- a/include/bout/field.hxx +++ b/include/bout/field.hxx @@ -543,47 +543,45 @@ class Field3DParallel; #ifdef FIELD_FUNC #error This macro has already been defined #else -#define FIELD_FUNC(name, func) \ - namespace bout::op { \ - struct name { \ - template \ - BOUT_HOST_DEVICE BoutReal operator()(int idx, const LView& L, \ - const RView& R) const { \ - return func(L(idx)); \ - } \ - }; \ - }; \ - template > \ - inline auto name(const T& f, const std::string& rgn = "RGN_ALL") { \ - if constexpr (std::is_same_v) { \ - /* Check if the input is allocated */ \ - checkData(f); \ - /* Define and allocate the output result */ \ - T result{emptyFrom(f)}; \ - BOUT_FOR(d, result.getRegion(rgn)) { result[d] = func(f[d]); } \ 
- for (int i = 0; i < f.numberParallelSlices(); ++i) { \ - result.yup(i) = func(f.yup(i)); \ - result.ydown(i) = func(f.ydown(i)); \ - } \ - result.name = std::string(#name "(") + f.name + std::string(")"); \ - checkData(result); \ - return result; \ - } else { \ - std::cout << "RUNNING " #name " with CUDA\n"; \ - return BinaryExpr{static_cast(f), \ - static_cast(f), \ - bout::op::name{}, \ - f.getMesh(), \ - f.getLocation(), \ - f.getDirections(), \ - std::nullopt, \ - f.getRegion(rgn)}; \ - } \ - } \ - template \ - inline auto name(const BinaryExpr& f, \ - const std::string& rgn = "RGN_ALL") { \ - return name(ResT{f}, rgn); \ +#define FIELD_FUNC(name, func) \ + namespace bout::op { \ + struct name { \ + template \ + BOUT_HOST_DEVICE BoutReal operator()(int idx, const LView& L, const RView&) const { \ + return func(L(idx)); \ + } \ + }; \ + }; \ + template > \ + inline auto name(const T& f, const std::string& rgn = "RGN_ALL") { \ + if constexpr (std::is_same_v) { \ + /* Check if the input is allocated */ \ + checkData(f); \ + /* Define and allocate the output result */ \ + T result{emptyFrom(f)}; \ + BOUT_FOR(d, result.getRegion(rgn)) { result[d] = func(f[d]); } \ + for (int i = 0; i < f.numberParallelSlices(); ++i) { \ + result.yup(i) = func(f.yup(i)); \ + result.ydown(i) = func(f.ydown(i)); \ + } \ + result.name = std::string(#name "(") + f.name + std::string(")"); \ + checkData(result); \ + return result; \ + } else { \ + return BinaryExpr{static_cast(f), \ + static_cast(f), \ + bout::op::name{}, \ + f.getMesh(), \ + f.getLocation(), \ + f.getDirections(), \ + std::nullopt, \ + f.getRegion(rgn)}; \ + } \ + } \ + template \ + inline auto name(const BinaryExpr& f, \ + const std::string& rgn = "RGN_ALL") { \ + return name(ResT{f}, rgn); \ } #endif diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index f3fd6ce4df..7e3ca7506d 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -116,7 +116,6 @@ public: || (is_expr_constant_v && 
is_expr_field2d_v) || (is_expr_field2d_v && is_expr_constant_v)>> Field2D(const BinaryExpr& expr) { - std::cout << "RUNNING Field2D constructor with CUDA\n"; Array data{expr.size()}; expr.evaluate(&data[0]); *this = std::move(Field2D{std::move(data), expr.getMesh(), expr.getLocation(), @@ -193,7 +192,6 @@ public: template std::enable_if_t, Field2D&> operator=(const BinaryExpr& expr) { - std::cout << "RUNNING Field2D operator= with CUDA\n"; if (isAllocated()) { expr.evaluate(&data[0]); } else { diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index e4c7b69ddc..2c5bc9d305 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -31,10 +31,12 @@ class Field3D; #include "bout/array.hxx" #include "bout/assert.hxx" #include "bout/bout_types.hxx" +#include "bout/build_config.hxx" #include "bout/field.hxx" #include "bout/field2d.hxx" #include "bout/field_data.hxx" #include "bout/fieldperp.hxx" +#include "bout/mesh.hxx" #include "bout/region.hxx" #include "bout/traits.hxx" @@ -43,6 +45,7 @@ class Field3D; #include #include #include +#include #include #include @@ -200,7 +203,6 @@ public: template || is_expr_field3d_v>> Field3D(const BinaryExpr& expr) { - //std::cout << "RUNNING constructor from BinaryExpr\n"; Array data{expr.size()}; expr.evaluate(&data[0]); *this = std::move(Field3D{std::move(data), expr.getMesh(), expr.getLocation(), @@ -491,7 +493,6 @@ public: template std::enable_if_t, Field3D&> operator=(BinaryExpr& expr) { - std::cout << "RUNNING operator= with CUDA\n"; regionID = expr.getRegionID(); if (isAllocated()) { expr.evaluate(&data[0]); @@ -964,149 +965,17 @@ Field3DParallel Field3D::asField3DParallel() { } Field3DParallel Field3D::asField3DParallel() const { return Field3DParallel(*this); } -inline Field3D operator+(const Field2D& lhs, const Field3DParallel& rhs) { - return lhs + rhs.asField3D(); -} -inline Field3D operator-(const Field2D& lhs, const Field3DParallel& rhs) { - return lhs + rhs.asField3D(); -} -inline Field3D 
operator*(const Field2D& lhs, const Field3DParallel& rhs) { - return lhs + rhs.asField3D(); -} -inline Field3D operator/(const Field2D& lhs, const Field3DParallel& rhs) { - return lhs + rhs.asField3D(); -} - -inline Field3D operator+(const Field3DParallel& lhs, const Field2D& rhs) { - return lhs.asField3D() + rhs; -} -inline Field3D operator-(const Field3DParallel& lhs, const Field2D& rhs) { - return lhs.asField3D() - rhs; -} -inline Field3D operator*(const Field3DParallel& lhs, const Field2D& rhs) { - return lhs.asField3D() * rhs; -} -inline Field3D operator/(const Field3DParallel& lhs, const Field2D& rhs) { - return lhs.asField3D() / rhs; -} - -inline Field3DParallel -filledFrom(const Field3DParallel& f, - const std::function& func) { - auto result{emptyFrom(f)}; - if (f.isFci()) { - BOUT_FOR(i, result.getRegion("RGN_NOY")) { result[i] = func(0, i); } - - for (size_t i = 0; i < result.numberParallelSlices(); ++i) { - result.yup(i).allocate(); - BOUT_FOR(d, result.yup(i).getValidRegionWithDefault("RGN_INVALID")) { - result.yup(i)[d] = func(i + 1, d); - } - result.ydown(i).allocate(); - BOUT_FOR(d, result.ydown(i).getValidRegionWithDefault("RGN_INVALID")) { - result.ydown(i)[d] = func(-i - 1, d); - } - } - } else { - BOUT_FOR(i, result.getRegion("RGN_ALL")) { result[i] = func(0, i); } - } - - return result; -} - -inline Field3D copy(const Field3D& f) { - Field3D result{f}; - result.allocate(); - for (size_t i = 0; i < result.numberParallelSlices(); ++i) { - result.yup(i).allocate(); - result.ydown(i).allocate(); - } - return result; -} - -/// Field3DParallel is intended to behave like Field3D, but preserve parallel -/// Fields. -/// Operations on Field3D, like multiplication, exp and floor only work on the -/// "main" field, Field3DParallel will retain the parallel slices. -class Field3DParallel : public Field3D { -public: - template - explicit Field3DParallel(Types... args) : Field3D(std::move(args)...) 
{ - ensureFieldAligned(); - } - Field3DParallel(const Field3D& f) : Field3D(f) { ensureFieldAligned(); } - Field3DParallel(const Field3D& f, bool isRef) : Field3D(f), isRef(isRef) { - ensureFieldAligned(); - } - Field3DParallel(const Field2D& f) : Field3D(f) { ensureFieldAligned(); } - // Explicitly needed, as DirectionTypes is sometimes constructed from a - // brace enclosed list - explicit Field3DParallel(Mesh* localmesh = nullptr, CELL_LOC location_in = CELL_CENTRE, - DirectionTypes directions_in = {YDirectionType::Standard, - ZDirectionType::Standard}, - std::optional regionID = {}) - : Field3D(localmesh, location_in, directions_in, regionID) { - if (isFci()) { - splitParallelSlices(); - } - ensureFieldAligned(); - } - explicit Field3DParallel(Array data, Mesh* localmesh, - CELL_LOC location = CELL_CENTRE, - DirectionTypes directions_in = {YDirectionType::Standard, - ZDirectionType::Standard}) - : Field3D(std::move(data), localmesh, location, directions_in) { - ensureFieldAligned(); - } - explicit Field3DParallel(BoutReal, Mesh* mesh = nullptr); - Field3D& asField3D() { return *this; } - const Field3D& asField3D() const { return *this; } - - Field3DParallel& operator*=(const Field3D&); - Field3DParallel& operator/=(const Field3D&); - Field3DParallel& operator+=(const Field3D&); - Field3DParallel& operator-=(const Field3D&); - Field3DParallel& operator*=(const Field3DParallel&); - Field3DParallel& operator/=(const Field3DParallel&); - Field3DParallel& operator+=(const Field3DParallel&); - Field3DParallel& operator-=(const Field3DParallel&); - Field3DParallel& operator*=(BoutReal); - Field3DParallel& operator/=(BoutReal); - Field3DParallel& operator+=(BoutReal); - Field3DParallel& operator-=(BoutReal); - Field3DParallel& operator=(const Field3D& rhs) { - Field3D::operator=(rhs); - ensureFieldAligned(); - return *this; - } - Field3DParallel& operator=(Field3D&& rhs) { - Field3D::operator=(std::move(rhs)); - ensureFieldAligned(); - return *this; - } - 
Field3DParallel& operator=(BoutReal); - Field3DParallel& allocate(); +// A raw Field3D is an expression leaf +template <> +struct is_expr_field3d : std::true_type {}; -private: - void ensureFieldAligned(); - bool isRef{false}; -}; +template <> +struct is_expr_field2d : std::true_type {}; -Field3DParallel Field3D::asField3DParallel() { - if (isAllocated()) { - allocate(); - for (size_t i = 0; i < numberParallelSlices(); ++i) { - if (yup(i).isAllocated()) { - yup(i).allocate(); - } - if (ydown(i).isAllocated()) { - ydown(i).allocate(); - } - } - } - return Field3DParallel(*this, true); -} -Field3DParallel Field3D::asField3DParallel() const { return Field3DParallel(*this); } +template +struct is_expr_field3d> + : std::integral_constant>::value + || is_expr_field3d_v>> {}; inline Field3D operator+(const Field2D& lhs, const Field3DParallel& rhs) { return lhs + rhs.asField3D(); @@ -1158,16 +1027,4 @@ filledFrom(const Field3DParallel& f, return result; } -// A raw Field3D is an expression leaf -template <> -struct is_expr_field3d : std::true_type {}; - -template <> -struct is_expr_field2d : std::true_type {}; - -template -struct is_expr_field3d> - : std::integral_constant>::value - || is_expr_field3d_v>> {}; - #endif /* BOUT_FIELD3D_H */ diff --git a/include/bout/fieldperp.hxx b/include/bout/fieldperp.hxx index b00ec1dcb4..c97028d913 100644 --- a/include/bout/fieldperp.hxx +++ b/include/bout/fieldperp.hxx @@ -2,9 +2,9 @@ * Class for 2D X-Z slices * ************************************************************************** - * Copyright 2010 B.D.Dudson, S.Farley, M.V.Umansky, X.Q.Xu + * Copyright 2010 - 2026 BOUT++ contributors * - * Contact: Ben Dudson, bd512@york.ac.uk + * Contact: Ben Dudson, dudson2@llnl.gov * * This file is part of BOUT++. 
* @@ -94,7 +94,6 @@ public: typename ResT, typename L, typename R, typename Func, typename = std::enable_if_t<(is_expr_fieldperp_v && is_expr_fieldperp_v)>> FieldPerp(const BinaryExpr& expr) { - std::cout << "RUNNING FieldPerp constructor with CUDA\n"; Array data{expr.size()}; expr.evaluate(&data[0]); *this = std::move(FieldPerp{std::move(data), expr.getMesh(), expr.getLocation(), diff --git a/include/bout/vector3d.hxx b/include/bout/vector3d.hxx index 655a85ca73..73d3912838 100644 --- a/include/bout/vector3d.hxx +++ b/include/bout/vector3d.hxx @@ -3,12 +3,10 @@ * * \brief Class for 3D vectors. Built on the Field3D class. * - * \author B. Dudson, October 2007 - * ************************************************************************** - * Copyright 2010 B.D.Dudson, S.Farley, M.V.Umansky, X.Q.Xu + * Copyright 2010 - 2026 BOUT++ contributors * - * Contact: Ben Dudson, bd512@york.ac.uk + * Contact: Ben Dudson, dudson2@llnl.gov * * This file is part of BOUT++. * From b0fd981bc3a5a59bea82c333d7e1b819663414e0 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18 Jun 2026 12:23:37 -0700 Subject: [PATCH 32/58] More fixes Defining missing operators and resolving include issues. 
--- include/bout/field2d.hxx | 9 ++++++ include/bout/field3d.hxx | 66 +++++++++++++++++++++++++--------------- src/field/field3d.cxx | 33 ++++++++++++++++++++ src/mesh/coordinates.cxx | 1 + src/mesh/difops.cxx | 3 +- 5 files changed, 86 insertions(+), 26 deletions(-) diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index 7e3ca7506d..b9c60aa02f 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -271,6 +271,15 @@ public: return operator()(jx, jy); } + Field2D& operator*=(const Field2D& rhs); + Field2D& operator/=(const Field2D& rhs); + Field2D& operator+=(const Field2D& rhs); + Field2D& operator-=(const Field2D& rhs); + Field2D& operator*=(BoutReal rhs); + Field2D& operator/=(BoutReal rhs); + Field2D& operator+=(BoutReal rhs); + Field2D& operator-=(BoutReal rhs); + #define FIELD2D_OP_EQUALS(OP_SYM) \ template \ std::enable_if_t || is_expr_constant_v, Field2D&> \ diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 2c5bc9d305..81db064b41 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -36,7 +36,6 @@ class Field3D; #include "bout/field2d.hxx" #include "bout/field_data.hxx" #include "bout/fieldperp.hxx" -#include "bout/mesh.hxx" #include "bout/region.hxx" #include "bout/traits.hxx" @@ -526,6 +525,20 @@ public: ///@} + Field3D& operator*=(const Field3D& rhs); + Field3D& operator+=(const Field3D& rhs); + Field3D& operator-=(const Field3D& rhs); + Field3D& operator*=(const Field3DParallel& rhs); + Field3D& operator/=(const Field3DParallel& rhs); + Field3D& operator+=(const Field3DParallel& rhs); + Field3D& operator-=(const Field3DParallel& rhs); + Field3D& operator*=(const Field2D& rhs); + Field3D& operator+=(const Field2D& rhs); + Field3D& operator-=(const Field2D& rhs); + Field3D& operator*=(BoutReal rhs); + Field3D& operator+=(BoutReal rhs); + Field3D& operator-=(BoutReal rhs); + /// Division operators ///@{ Field3D& operator/=(const Field3D& rhs); @@ -965,10 +978,29 @@ Field3DParallel 
Field3D::asField3DParallel() { } Field3DParallel Field3D::asField3DParallel() const { return Field3DParallel(*this); } +inline Field3D& Field3D::operator*=(const Field3DParallel& rhs) { + return (*this) *= rhs.asField3D(); +} + +inline Field3D& Field3D::operator/=(const Field3DParallel& rhs) { + return (*this) /= rhs.asField3D(); +} + +inline Field3D& Field3D::operator+=(const Field3DParallel& rhs) { + return (*this) += rhs.asField3D(); +} + +inline Field3D& Field3D::operator-=(const Field3DParallel& rhs) { + return (*this) -= rhs.asField3D(); +} + // A raw Field3D is an expression leaf template <> struct is_expr_field3d : std::true_type {}; +template <> +struct is_expr_field3d : std::true_type {}; + template <> struct is_expr_field2d : std::true_type {}; @@ -977,31 +1009,15 @@ struct is_expr_field3d> : std::integral_constant>::value || is_expr_field3d_v>> {}; -inline Field3D operator+(const Field2D& lhs, const Field3DParallel& rhs) { - return lhs + rhs.asField3D(); -} -inline Field3D operator-(const Field2D& lhs, const Field3DParallel& rhs) { - return lhs + rhs.asField3D(); -} -inline Field3D operator*(const Field2D& lhs, const Field3DParallel& rhs) { - return lhs + rhs.asField3D(); -} -inline Field3D operator/(const Field2D& lhs, const Field3DParallel& rhs) { - return lhs + rhs.asField3D(); -} +Field3D operator+(const Field2D& lhs, const Field3DParallel& rhs); +Field3D operator-(const Field2D& lhs, const Field3DParallel& rhs); +Field3D operator*(const Field2D& lhs, const Field3DParallel& rhs); +Field3D operator/(const Field2D& lhs, const Field3DParallel& rhs); -inline Field3D operator+(const Field3DParallel& lhs, const Field2D& rhs) { - return lhs.asField3D() + rhs; -} -inline Field3D operator-(const Field3DParallel& lhs, const Field2D& rhs) { - return lhs.asField3D() - rhs; -} -inline Field3D operator*(const Field3DParallel& lhs, const Field2D& rhs) { - return lhs.asField3D() * rhs; -} -inline Field3D operator/(const Field3DParallel& lhs, const Field2D& rhs) { - 
return lhs.asField3D() / rhs; -} +Field3D operator+(const Field3DParallel& lhs, const Field2D& rhs); +Field3D operator-(const Field3DParallel& lhs, const Field2D& rhs); +Field3D operator*(const Field3DParallel& lhs, const Field2D& rhs); +Field3D operator/(const Field3DParallel& lhs, const Field2D& rhs); inline Field3DParallel filledFrom(const Field3DParallel& f, diff --git a/src/field/field3d.cxx b/src/field/field3d.cxx index cc2a342f37..238bef1993 100644 --- a/src/field/field3d.cxx +++ b/src/field/field3d.cxx @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -84,6 +85,38 @@ Field3D::Field3D(const Field3D& f) } } +Field3D operator+(const Field2D& lhs, const Field3DParallel& rhs) { + return lhs + rhs.asField3D(); +} + +Field3D operator-(const Field2D& lhs, const Field3DParallel& rhs) { + return lhs - rhs.asField3D(); +} + +Field3D operator*(const Field2D& lhs, const Field3DParallel& rhs) { + return lhs * rhs.asField3D(); +} + +Field3D operator/(const Field2D& lhs, const Field3DParallel& rhs) { + return lhs / rhs.asField3D(); +} + +Field3D operator+(const Field3DParallel& lhs, const Field2D& rhs) { + return lhs.asField3D() + rhs; +} + +Field3D operator-(const Field3DParallel& lhs, const Field2D& rhs) { + return lhs.asField3D() - rhs; +} + +Field3D operator*(const Field3DParallel& lhs, const Field2D& rhs) { + return lhs.asField3D() * rhs; +} + +Field3D operator/(const Field3DParallel& lhs, const Field2D& rhs) { + return lhs.asField3D() / rhs; +} + Field3D::Field3D(const Field2D& f) : Field(f) { nx = fieldmesh->LocalNx; diff --git a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx index 6334054533..8980b00695 100644 --- a/src/mesh/coordinates.cxx +++ b/src/mesh/coordinates.cxx @@ -4,6 +4,7 @@ * given the contravariant metric tensor terms **************************************************************************/ +#include "bout/coordinates_accessor.hxx" #include "bout/field3d.hxx" #include "bout/field_data.hxx" #include diff --git 
a/src/mesh/difops.cxx b/src/mesh/difops.cxx index 09433b0685..902ebfce93 100644 --- a/src/mesh/difops.cxx +++ b/src/mesh/difops.cxx @@ -284,7 +284,8 @@ Field3D Div_par_flux(const Field3D& v, const Field3D& f, CELL_LOC outloc, auto Bxy_floc = f.getCoordinates()->Bxy; if (!f.hasParallelSlices()) { - return metric->Bxy * FDDY(v, f / Bxy_floc, outloc, method) / sqrt(metric->g_22); + Field3D f_B = f / Bxy_floc; + return metric->Bxy * FDDY(v, f_B, outloc, method) / sqrt(metric->g_22); } // Need to modify yup and ydown fields From bb5b495eb7199eee884eec001d36e9f57e86339c Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18 Jun 2026 12:51:16 -0700 Subject: [PATCH 33/58] Unit tests compiling again --- include/bout/fieldops.hxx | 29 +---------- .../invert/laplace/test_laplace_cyclic.cxx | 27 +++++----- tests/unit/test_extras.cxx | 11 ++++ tests/unit/test_extras.hxx | 50 ++++++++++++++----- 4 files changed, 62 insertions(+), 55 deletions(-) diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 26e3b82cc5..302769ae70 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -187,38 +187,11 @@ struct BinaryExpr { const Region& region) //: lhs(static_cast(lhs)), rhs(static_cast(rhs)), : lhs(lhs), rhs(rhs), f(f), mesh(mesh), location(location), directions(directions), - regionID(regionID), indices(region.getIndices().size()) { + indices(region.getIndices().size()), regionID(regionID) { // Copy the region indices into the managed array for (int i = 0; i < indices.size(); ++i) { indices[i] = region.getIndices()[i].ind; } - //std::cout << "===PRE-sorting indices\n"; - //for (auto& ind : indices) { - // std::cout << ind << " "; - //} - //std::cout << "===end PRE\n"; - //std::sort(indices.begin(), indices.end(), - // [](const auto& a, const auto& b) { return a < b; }); - //std::cout << "===POST-sorting indices\n"; - //for (auto& ind : indices) { - // std::cout << ind << " "; - //} - //std::cout << "===end POST\n"; - //if 
(regionIndicesCache.find(static_cast(const_cast*>(®ion))) - // != regionIndicesCache.end()) { - // // If we have already computed the indices for this region, use them - // indices = - // regionIndicesCache[static_cast(const_cast*>(®ion))]; - //} else { - // // Otherwise, compute the indices and store them in the cache - // indices = Array(region.getIndices().size()); - // // Copy the region indices into the managed array - // for (int i = 0; i < indices.size(); ++i) { - // indices[i] = region.getIndices()[i].ind; - // } - // regionIndicesCache[static_cast(const_cast*>(®ion))] = - // indices; - //} } BinaryExpr& operator=(const BinaryExpr&) = delete; diff --git a/tests/unit/invert/laplace/test_laplace_cyclic.cxx b/tests/unit/invert/laplace/test_laplace_cyclic.cxx index a0d99f66c9..586dd80adf 100644 --- a/tests/unit/invert/laplace/test_laplace_cyclic.cxx +++ b/tests/unit/invert/laplace/test_laplace_cyclic.cxx @@ -10,6 +10,7 @@ #include "bout/invert_laplace.hxx" #include "gtest/gtest.h" +#include "bout/bout_types.hxx" #include "bout/derivs.hxx" #include "bout/difops.hxx" #include "bout/field2d.hxx" @@ -17,7 +18,6 @@ #include "bout/griddata.hxx" #include "bout/mesh.hxx" #include "bout/options.hxx" -#include "bout/vecops.hxx" #include "fake_mesh_fixture.hxx" @@ -29,14 +29,13 @@ class CyclicForwardOperator { CyclicForwardOperator(bool xin_neumann, bool xout_neumann) : inner_x_neumann(xin_neumann), outer_x_neumann(xout_neumann), - a(0.0), c1(1.0), c2(1.0), d(1.0), ex(0.0), ez(0.0) { - coords = mesh->getCoordinates(CELL_CENTER); - } + a(0.0), c1(1.0), c2(1.0), d(1.0), ex(0.0), ez(0.0), + coords(mesh->getCoordinates(CELL_CENTER)) {} - const Field3D operator()(Field3D& f) { - auto result = d * Delp2(f) - + (coords->g11 * DDX(f) + coords->g13 * DDZ(f)) * DDX(c2) / c1 + a * f - + ex * DDX(f) + ez * DDZ(f); + Field3D operator()(Field3D& f) { + Field3D result = d * Delp2(f) + + (coords->g11 * DDX(f) + coords->g13 * DDZ(f)) * DDX(c2) / c1 + + a * f + ex * DDX(f) + ez * DDZ(f); 
applyBoundaries(result, f); return result; } @@ -45,7 +44,7 @@ class CyclicForwardOperator { CyclicForwardOperator(); bool inner_x_neumann, outer_x_neumann; // If false then use Dirichlet conditions - void applyBoundaries(Field3D& newF, const Field3D& f) { + void applyBoundaries(Field3D& newF, const Field3D& f) const { BOUT_FOR(i, f.getMesh()->getRegion3D("RGN_INNER_X")) { if (inner_x_neumann) { newF[i] = (f[i.xp()] - f[i]) / coords->dx[i] / sqrt(coords->g_11[i]); @@ -86,14 +85,14 @@ class CyclicTest : public FakeMeshFixture, coef3.allocate(); BOUT_FOR(i, mesh->getRegion2D("RGN_ALL")) { - BoutReal x = i.x() / (BoutReal)nx - 0.5; - BoutReal y = i.y() / (BoutReal)ny - 0.5; + const BoutReal x = i.x() / (BoutReal)nx - 0.5; + const BoutReal y = i.y() / (BoutReal)ny - 0.5; coef2[i] = x + y; } BOUT_FOR(i, mesh->getRegion3D("RGN_ALL")) { - BoutReal x = i.x() / (BoutReal)nx - 0.5; - BoutReal y = i.y() / (BoutReal)ny - 0.5; - BoutReal z = i.z() / (BoutReal)nz - 0.5; + const BoutReal x = i.x() / (BoutReal)nx - 0.5; + const BoutReal y = i.y() / (BoutReal)ny - 0.5; + const BoutReal z = i.z() / (BoutReal)nz - 0.5; f3[i] = 1e3 * exp(-0.5 * sqrt(x * x + y * y + z * z) / sigmasq); coef3[i] = x + y + sin(2 * 3.14159265358979323846 * z); } diff --git a/tests/unit/test_extras.cxx b/tests/unit/test_extras.cxx index b1caf038d3..7cf7cc7c0d 100644 --- a/tests/unit/test_extras.cxx +++ b/tests/unit/test_extras.cxx @@ -1,4 +1,5 @@ #include "test_extras.hxx" +#include "fake_mesh_fixture.hxx" #include "bout/bout_types.hxx" #include "bout/field2d.hxx" #include "bout/field3d.hxx" @@ -42,3 +43,13 @@ void fillField(Field2D& f, std::vector> values) { } } } + +using TestExtrasFieldExpr = FakeMeshFixture; + +TEST_F(TestExtrasFieldExpr, IsFieldEqualHandlesBinaryExprOnEitherSide) { + const Field2D field{1.0}; + const Field2D expected{3.0}; + + EXPECT_TRUE(IsFieldEqual(field + 2.0, expected)); + EXPECT_TRUE(IsFieldEqual(expected, field + 2.0)); +} diff --git a/tests/unit/test_extras.hxx 
b/tests/unit/test_extras.hxx index dcc6cb3187..6038d981d4 100644 --- a/tests/unit/test_extras.hxx +++ b/tests/unit/test_extras.hxx @@ -7,15 +7,16 @@ #include #include #include +#include #include #include "bout/bout_types.hxx" #include "bout/field.hxx" +#include "bout/field2d.hxx" +#include "bout/field3d.hxx" +#include "bout/fieldperp.hxx" #include "bout/region.hxx" -class Field2D; -class Field3D; - static constexpr BoutReal BoutRealTolerance{1e-15}; // FFTs have a slightly looser tolerance than other functions static constexpr BoutReal FFTTolerance{1.e-12}; @@ -29,6 +30,23 @@ void fillField(Field2D& f, std::vector> values); using bout::utils::EnableIfField; +template +inline constexpr bool isFieldOrFieldExpr_v = + bout::utils::is_Field_v> || is_expr_field2d_v + || is_expr_field3d_v || is_expr_fieldperp_v; + +template >>> +auto evaluateFieldExpr(const T& field) -> const T& { + return field; +} + +template >> +auto evaluateFieldExpr(const BinaryExpr& expr) -> ResT { + return ResT{expr}; +} + /// Returns a field filled with the result of \p fill_function at each point /// Arbitrary arguments can be passed to the field constructor template > @@ -80,15 +98,19 @@ auto inline getIndexXYZ(const IndPerp& index) -> std::string { } /// Is \p field equal to \p reference, with a tolerance of \p tolerance? 
-template > +template && isFieldOrFieldExpr_v>> auto IsFieldEqual(const T& field, const U& reference, const std::string& region = "RGN_ALL", BoutReal tolerance = BoutRealTolerance) -> ::testing::AssertionResult { - for (auto i : field.getRegion(region)) { - if (fabs(field[i] - reference[i]) > tolerance) { + const auto& evaluated_field = evaluateFieldExpr(field); + const auto& evaluated_reference = evaluateFieldExpr(reference); + + for (auto i : evaluated_field.getRegion(region)) { + if (fabs(evaluated_field[i] - evaluated_reference[i]) > tolerance) { return ::testing::AssertionFailure() - << getFieldType(field) << "(" << getIndexXYZ(i) << ") == " << field[i] - << "; Expected: " << reference[i]; + << getFieldType(evaluated_field) << "(" << getIndexXYZ(i) + << ") == " << evaluated_field[i] << "; Expected: " << evaluated_reference[i]; } } return ::testing::AssertionSuccess(); @@ -96,15 +118,17 @@ auto IsFieldEqual(const T& field, const U& reference, /// Is \p field equal to \p reference, with a tolerance of \p tolerance? 
/// Overload for BoutReals -template > +template >> auto IsFieldEqual(const T& field, BoutReal reference, const std::string& region = "RGN_ALL", BoutReal tolerance = BoutRealTolerance) -> ::testing::AssertionResult { - for (auto i : field.getRegion(region)) { - if (fabs(field[i] - reference) > tolerance) { + const auto& evaluated_field = evaluateFieldExpr(field); + + for (auto i : evaluated_field.getRegion(region)) { + if (fabs(evaluated_field[i] - reference) > tolerance) { return ::testing::AssertionFailure() - << getFieldType(field) << "(" << getIndexXYZ(i) << ") == " << field[i] - << "; Expected: " << reference; + << getFieldType(evaluated_field) << "(" << getIndexXYZ(i) + << ") == " << evaluated_field[i] << "; Expected: " << reference; } } return ::testing::AssertionSuccess(); From a06e450975fadfc7777950400bffde0d767d3238 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18 Jun 2026 13:19:30 -0700 Subject: [PATCH 34/58] Fix warnings coming from Field constructors --- include/bout/field2d.hxx | 16 ++++++++++------ include/bout/field3d.hxx | 15 ++++++++++----- include/bout/fieldops.hxx | 5 ++--- include/bout/fieldperp.hxx | 16 ++++++++++------ 4 files changed, 32 insertions(+), 20 deletions(-) diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index b9c60aa02f..340b4b29fc 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -115,12 +115,9 @@ public: typename = std::enable_if_t<(is_expr_field2d_v && is_expr_field2d_v) || (is_expr_constant_v && is_expr_field2d_v) || (is_expr_field2d_v && is_expr_constant_v)>> - Field2D(const BinaryExpr& expr) { - Array data{expr.size()}; - expr.evaluate(&data[0]); - *this = std::move(Field2D{std::move(data), expr.getMesh(), expr.getLocation(), - expr.getDirections()}); - } + Field2D(const BinaryExpr& expr) + : Field2D(evaluateBinaryExpr(expr), expr.getMesh(), expr.getLocation(), + expr.getDirections()) {} /*! 
* Destructor */ @@ -351,6 +348,13 @@ public: BOUT_DEVICE inline BoutReal operator()(int i) const { return View()(i); } private: + template + static Array evaluateBinaryExpr(const BinaryExpr& expr) { + Array data{expr.size()}; + expr.evaluate(&data[0]); + return data; + } + /// Internal data array. Handles allocation/freeing of memory Array data; diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 81db064b41..c2448e82fb 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -201,11 +201,9 @@ public: ZDirectionType::Standard}); template || is_expr_field3d_v>> - Field3D(const BinaryExpr& expr) { - Array data{expr.size()}; - expr.evaluate(&data[0]); - *this = std::move(Field3D{std::move(data), expr.getMesh(), expr.getLocation(), - expr.getDirections()}); + Field3D(const BinaryExpr& expr) + : Field3D(evaluateBinaryExpr(expr), expr.getMesh(), expr.getLocation(), + expr.getDirections()) { setRegion(expr.getRegionID()); } /// Destructor @@ -643,6 +641,13 @@ protected: template > void _track(const T& change, std::string operation); void _track(const BoutReal& change, std::string operation); + + template + static Array evaluateBinaryExpr(const BinaryExpr& expr) { + Array data{expr.size()}; + expr.evaluate(&data[0]); + return data; + } }; // Non-member overloaded operators diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 302769ae70..7cb5c67643 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -185,9 +185,8 @@ struct BinaryExpr { BinaryExpr(const typename L::View& lhs, const typename R::View& rhs, Func f, Mesh* mesh, CELL_LOC location, DirectionTypes directions, std::optional regionID, const Region& region) - //: lhs(static_cast(lhs)), rhs(static_cast(rhs)), - : lhs(lhs), rhs(rhs), f(f), mesh(mesh), location(location), directions(directions), - indices(region.getIndices().size()), regionID(regionID) { + : lhs(lhs), rhs(rhs), indices(region.getIndices().size()), f(f), mesh(mesh), + 
location(location), directions(directions), regionID(regionID) { // Copy the region indices into the managed array for (int i = 0; i < indices.size(); ++i) { indices[i] = region.getIndices()[i].ind; diff --git a/include/bout/fieldperp.hxx b/include/bout/fieldperp.hxx index c97028d913..74414d01d6 100644 --- a/include/bout/fieldperp.hxx +++ b/include/bout/fieldperp.hxx @@ -93,12 +93,9 @@ public: template < typename ResT, typename L, typename R, typename Func, typename = std::enable_if_t<(is_expr_fieldperp_v && is_expr_fieldperp_v)>> - FieldPerp(const BinaryExpr& expr) { - Array data{expr.size()}; - expr.evaluate(&data[0]); - *this = std::move(FieldPerp{std::move(data), expr.getMesh(), expr.getLocation(), - /* yindex */ -1, expr.getDirections()}); - } + FieldPerp(const BinaryExpr& expr) + : FieldPerp(evaluateBinaryExpr(expr), expr.getMesh(), expr.getLocation(), + /* yindex */ -1, expr.getDirections()) {} ~FieldPerp() override = default; @@ -340,6 +337,13 @@ public: operator View() const { return View{const_cast(&data[0])}; } private: + template + static Array evaluateBinaryExpr(const BinaryExpr& expr) { + Array data{expr.size()}; + expr.evaluate(&data[0]); + return data; + } + /// The Y index at which this FieldPerp is defined int yindex{-1}; From 801890652a437ebd36d45cd4078845b7b5150712 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18 Jun 2026 13:47:46 -0700 Subject: [PATCH 35/58] Fix unit tests Expressions need to check the location of their arguments. 
--- include/bout/field.hxx | 22 ++++++++++++++++++++++ include/bout/field2d.hxx | 1 + include/bout/field3d.hxx | 2 ++ 3 files changed, 25 insertions(+) diff --git a/include/bout/field.hxx b/include/bout/field.hxx index e707b62816..6b745747f8 100644 --- a/include/bout/field.hxx +++ b/include/bout/field.hxx @@ -183,8 +183,30 @@ inline bool areFieldsCompatible(const Field& field1, const Field& field2) { #field2, toString((field2).getDirections())); \ } +#define ASSERT1_EXPR_COMPATIBLE(expr1, expr2) \ + if ((expr1).getLocation() != (expr2).getLocation()) { \ + throw BoutException("Error in {:s}:{:d}\nFields at different position:" \ + "`{:s}` at {:s}, `{:s}` at {:s}", \ + __FILE__, __LINE__, #expr1, toString((expr1).getLocation()), \ + #expr2, toString((expr2).getLocation())); \ + } \ + if ((expr1).getMesh() != (expr2).getMesh()) { \ + throw BoutException("Error in {:s}:{:d}\nFields are on different Meshes:" \ + "`{:s}` at {:p}, `{:s}` at {:p}", \ + __FILE__, __LINE__, #expr1, \ + static_cast((expr1).getMesh()), #expr2, \ + static_cast((expr2).getMesh())); \ + } \ + if (!areDirectionsCompatible((expr1).getDirections(), (expr2).getDirections())) { \ + throw BoutException("Error in {:s}:{:d}\nFields at different directions:" \ + "`{:s}` at {:s}, `{:s}` at {:s}", \ + __FILE__, __LINE__, #expr1, toString((expr1).getDirections()), \ + #expr2, toString((expr2).getDirections())); \ + } + #else #define ASSERT1_FIELDS_COMPATIBLE(field1, field2) ; +#define ASSERT1_EXPR_COMPATIBLE(expr1, expr2) ; #endif /// Return an empty shell field of some type derived from Field, with metadata diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index 340b4b29fc..c7d8252aaa 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -397,6 +397,7 @@ FIELD2D_FIELD2D_FIELD2D_OP(/, Div) std::enable_if_t && is_expr_field3d_v, \ BinaryExpr> \ operator OP_SYM(const L& lhs, const R& rhs) { \ + ASSERT1_EXPR_COMPATIBLE(lhs, rhs); \ auto regionID = rhs.getRegionID(); \ int 
mesh_nz = rhs.getMesh()->LocalNz; \ return BinaryExpr{ \ diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index c2448e82fb..e27a7600d2 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -666,6 +666,7 @@ FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); typename = std::enable_if_t && is_expr_field3d_v>> \ BinaryExpr operator OP_SYM(const L& lhs, \ const R& rhs) { \ + ASSERT1_EXPR_COMPATIBLE(lhs, rhs); \ auto regionID = \ lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); \ return BinaryExpr{ \ @@ -690,6 +691,7 @@ FIELD3D_FIELD3D_FIELD3D_OP(/, Div) std::enable_if_t && is_expr_field2d_v, \ BinaryExpr> \ operator OP_SYM(const L& lhs, const R& rhs) { \ + ASSERT1_EXPR_COMPATIBLE(lhs, rhs); \ auto regionID = lhs.getRegionID(); \ int mesh_nz = lhs.getMesh()->LocalNz; \ return BinaryExpr{ \ From 6cab892e52f384a5507e9781d72b040947b7fac1 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18 Jun 2026 14:43:29 -0700 Subject: [PATCH 36/58] BinaryExpr::operator[](IndType) indexing Allows evaluation of binary expressions at specific indices. Fixed boundary implementation in Petsc3DAMG. 
--- include/bout/fieldops.hxx | 5 ++ .../laplace/impls/petsc3damg/petsc3damg.cxx | 73 +++++++++++++------ tests/unit/test_extras.cxx | 16 ++++ 3 files changed, 70 insertions(+), 24 deletions(-) diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 7cb5c67643..7475661376 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -200,6 +200,11 @@ struct BinaryExpr { inline BoutReal operator()(int idx) const { return f(idx, lhs, rhs); // single‐pass fusion } + template + BOUT_HOST_DEVICE BOUT_FORCEINLINE auto operator[](const IndType& d) const + -> decltype(d.ind, BoutReal{}) { + return operator()(d.ind); + } inline int regionIdx(int idx) const { return indices[idx]; } //operator ResT() { return ResT{*this}; } diff --git a/src/invert/laplace/impls/petsc3damg/petsc3damg.cxx b/src/invert/laplace/impls/petsc3damg/petsc3damg.cxx index 3be5e43e63..ddd0b15b7e 100644 --- a/src/invert/laplace/impls/petsc3damg/petsc3damg.cxx +++ b/src/invert/laplace/impls/petsc3damg/petsc3damg.cxx @@ -120,39 +120,64 @@ LaplacePetsc3dAmg::LaplacePetsc3dAmg(Options* opt, const CELL_LOC loc, Mesh* mes // Set up boundary conditions in operator const bool inner_X_neumann = isInnerBoundaryFlagSet(INVERT_AC_GRAD); - const auto inner_X_BC = inner_X_neumann ? -1. / coords->dx / sqrt(coords->g_11) : 0.5; - const auto inner_X_BC_plus = inner_X_neumann ? -inner_X_BC : 0.5; - - BOUT_FOR_SERIAL(i, indexer->getRegionInnerX()) { - operator3D(i, i) = inner_X_BC[i]; - operator3D(i, i.xp()) = inner_X_BC_plus[i]; + if (inner_X_neumann) { + // This is a BinaryExpr that is only evaluated when needed + const auto inner_X_BC = -1. 
/ coords->dx / sqrt(coords->g_11); + BOUT_FOR_SERIAL(i, indexer->getRegionInnerX()) { + const BoutReal bc = inner_X_BC[i]; + operator3D(i, i) = bc; + operator3D(i, i.xp()) = -bc; + } + } else { + BOUT_FOR_SERIAL(i, indexer->getRegionInnerX()) { + operator3D(i, i) = 0.5; + operator3D(i, i.xp()) = 0.5; + } } const bool outer_X_neumann = isOuterBoundaryFlagSet(INVERT_AC_GRAD); - const auto outer_X_BC = outer_X_neumann ? 1. / coords->dx / sqrt(coords->g_11) : 0.5; - const auto outer_X_BC_minus = outer_X_neumann ? -outer_X_BC : 0.5; - - BOUT_FOR_SERIAL(i, indexer->getRegionOuterX()) { - operator3D(i, i) = outer_X_BC[i]; - operator3D(i, i.xm()) = outer_X_BC_minus[i]; + if (outer_X_neumann) { + const auto outer_X_BC = 1. / coords->dx / sqrt(coords->g_11); + BOUT_FOR_SERIAL(i, indexer->getRegionOuterX()) { + const BoutReal bc = outer_X_BC[i]; + operator3D(i, i) = bc; + operator3D(i, i.xm()) = -bc; + } + } else { + BOUT_FOR_SERIAL(i, indexer->getRegionOuterX()) { + operator3D(i, i) = 0.5; + operator3D(i, i.xm()) = 0.5; + } } const bool lower_Y_neumann = flagSet(lower_boundary_flags, INVERT_AC_GRAD); - const auto lower_Y_BC = lower_Y_neumann ? -1. / coords->dy / sqrt(coords->g_22) : 0.5; - const auto lower_Y_BC_plus = lower_Y_neumann ? -lower_Y_BC : 0.5; - - BOUT_FOR_SERIAL(i, indexer->getRegionLowerY()) { - operator3D(i, i) = lower_Y_BC[i]; - operator3D(i, i.yp()) = lower_Y_BC_plus[i]; + if (lower_Y_neumann) { + const auto lower_Y_BC = -1. / coords->dy / sqrt(coords->g_22); + BOUT_FOR_SERIAL(i, indexer->getRegionLowerY()) { + const BoutReal bc = lower_Y_BC[i]; + operator3D(i, i) = bc; + operator3D(i, i.yp()) = -bc; + } + } else { + BOUT_FOR_SERIAL(i, indexer->getRegionLowerY()) { + operator3D(i, i) = 0.5; + operator3D(i, i.yp()) = 0.5; + } } const bool upper_Y_neumann = flagSet(upper_boundary_flags, INVERT_AC_GRAD); - const auto upper_Y_BC = upper_Y_neumann ? 1. / coords->dy / sqrt(coords->g_22) : 0.5; - const auto upper_Y_BC_minus = upper_Y_neumann ? 
-upper_Y_BC : 0.5; - - BOUT_FOR_SERIAL(i, indexer->getRegionUpperY()) { - operator3D(i, i) = upper_Y_BC[i]; - operator3D(i, i.ym()) = upper_Y_BC_minus[i]; + if (upper_Y_neumann) { + const auto upper_Y_BC = 1. / coords->dy / sqrt(coords->g_22); + BOUT_FOR_SERIAL(i, indexer->getRegionUpperY()) { + const BoutReal bc = upper_Y_BC[i]; + operator3D(i, i) = bc; + operator3D(i, i.ym()) = -bc; + } + } else { + BOUT_FOR_SERIAL(i, indexer->getRegionUpperY()) { + operator3D(i, i) = 0.5; + operator3D(i, i.ym()) = 0.5; + } } } diff --git a/tests/unit/test_extras.cxx b/tests/unit/test_extras.cxx index 7cf7cc7c0d..f096292ed3 100644 --- a/tests/unit/test_extras.cxx +++ b/tests/unit/test_extras.cxx @@ -53,3 +53,19 @@ TEST_F(TestExtrasFieldExpr, IsFieldEqualHandlesBinaryExprOnEitherSide) { EXPECT_TRUE(IsFieldEqual(field + 2.0, expected)); EXPECT_TRUE(IsFieldEqual(expected, field + 2.0)); } + +TEST_F(TestExtrasFieldExpr, BinaryExprCanBeIndexedWithRegionIndex) { + const Field3D lhs{ + makeField([](const Ind3D& i) { return static_cast(i.x()); })}; + const Field3D rhs{ + makeField([](const Ind3D& i) { return static_cast(i.y()); })}; + + const auto expr = lhs + 2.0 * rhs; + Field3D result{emptyFrom(lhs)}; + + BOUT_FOR_SERIAL(i, result.getRegion("RGN_ALL")) { result[i] = expr[i]; } + + BOUT_FOR(i, result.getRegion("RGN_ALL")) { + EXPECT_DOUBLE_EQ(result[i], lhs[i] + 2.0 * rhs[i]); + } +} From 1caebb6874a6f85af3e2c359f9caed9ccdc9d1aa Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18 Jun 2026 15:25:20 -0700 Subject: [PATCH 37/58] SQ and mean for BinaryExpr `SQ` ensures that its argument is only evaluated once. `mean` is defined analogously to `min` and `max`: It evaluates the expression and then calls the existing field operator. 
--- include/bout/field.hxx | 63 +++++++++++++++++++ include/bout/fieldops.hxx | 6 ++ include/bout/utils.hxx | 5 +- tests/unit/field/test_field2d.cxx | 13 ++++ tests/unit/field/test_field3d.cxx | 30 +++++++++ tests/unit/field/test_fieldperp.cxx | 8 +++ .../laplace/test_laplace_petsc3damg.cxx | 6 +- 7 files changed, 127 insertions(+), 4 deletions(-) diff --git a/include/bout/field.hxx b/include/bout/field.hxx index 6b745747f8..b0477d756e 100644 --- a/include/bout/field.hxx +++ b/include/bout/field.hxx @@ -496,6 +496,12 @@ inline BoutReal mean(const T& f, bool allpe = false, return result / static_cast(count); } +template +inline BoutReal mean(const BinaryExpr& f, bool allpe = false, + const std::string& rgn = "RGN_NOBNDRY") { + return mean(ResT{f}, allpe, rgn); +} + /// Exponent: pow(lhs, lhs) is \p lhs raised to the power of \p rhs /// /// This loops over the entire domain, including guard/boundary cells by @@ -607,6 +613,63 @@ class Field3DParallel; } #endif +namespace bout::op { +struct Square { + template + BOUT_HOST_DEVICE BoutReal operator()(int idx, const LView& L, const RView&) const { + const BoutReal value = L(idx); + return ::SQ(value); + } +}; +}; // namespace bout::op + +template > +inline auto SQ(const T& f, const std::string& rgn = "RGN_ALL") { + if constexpr (std::is_same_v) { + checkData(f); + T result{emptyFrom(f)}; + if (f.hasParallelSlices() and !result.hasParallelSlices()) { + result.splitParallelSlices(); + } + BOUT_FOR(d, result.getRegion(rgn)) { result[d] = ::SQ(f[d]); } + for (size_t i = 0; i < f.numberParallelSlices(); ++i) { + result.yup(i) = SQ(f.yup(i), rgn); + result.ydown(i) = SQ(f.ydown(i), rgn); + } + result.name = std::string("SQ(") + f.name + std::string(")"); + checkData(result); + return result; + } else { + return BinaryExpr{static_cast(f), + static_cast(f), + bout::op::Square{}, + f.getMesh(), + f.getLocation(), + f.getDirections(), + std::nullopt, + f.getRegion(rgn)}; + } +} + +template +inline auto SQ(const BinaryExpr& f) { + 
return BinaryExpr, BinaryExpr, + bout::op::Square>{ + static_cast::View>(f), + static_cast::View>(f), + bout::op::Square{}, + f.getMesh(), + f.getLocation(), + f.getDirections(), + f.getRegionID(), + f.indices}; +} + +template +inline auto SQ(const BinaryExpr& f, const std::string& rgn) { + return SQ(ResT{f}, rgn); +} + /// Square root of \p f over region \p rgn /// /// This loops over the entire domain, including guard/boundary cells by diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 7475661376..5e662b24d4 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -193,6 +193,12 @@ struct BinaryExpr { } } + BinaryExpr(const typename L::View& lhs, const typename R::View& rhs, Func f, Mesh* mesh, + CELL_LOC location, DirectionTypes directions, std::optional regionID, + const Array& indices) + : lhs(lhs), rhs(rhs), indices(indices), f(f), mesh(mesh), location(location), + directions(directions), regionID(regionID) {} + BinaryExpr& operator=(const BinaryExpr&) = delete; BinaryExpr& operator=(BinaryExpr&&) = delete; diff --git a/include/bout/utils.hxx b/include/bout/utils.hxx index 9b24311c33..6fb7d38dc2 100644 --- a/include/bout/utils.hxx +++ b/include/bout/utils.hxx @@ -47,6 +47,8 @@ #include #include +class Field; + #ifdef _MSC_VER // finite is not actually standard C++, it's a BSD extention for C inline auto finite(BoutReal x) -> bool { return std::isfinite(x); } @@ -436,7 +438,8 @@ inline BoutReal randomu() { * Calculate the square of a variable \p t * i.e. 
t * t */ -template +template >>> inline auto SQ(const T& t) { return t * t; } diff --git a/tests/unit/field/test_field2d.cxx b/tests/unit/field/test_field2d.cxx index cf9a0c1f25..67cf1d6c61 100644 --- a/tests/unit/field/test_field2d.cxx +++ b/tests/unit/field/test_field2d.cxx @@ -1175,6 +1175,19 @@ TEST_F(Field2DTest, Sqrt) { EXPECT_TRUE(IsFieldEqual(sqrt(field), 4.0)); } +TEST_F(Field2DTest, SQExpressionUsesSquareOp) { + Field2D field; + + field = 2.0; + const auto expr = field + 1.0; + + EXPECT_TRUE( + (std::is_same_v, + BinaryExpr, + std::decay_t, bout::op::Square>>)); + EXPECT_TRUE(IsFieldEqual(SQ(expr), 9.0)); +} + TEST_F(Field2DTest, Abs) { Field2D field; diff --git a/tests/unit/field/test_field3d.cxx b/tests/unit/field/test_field3d.cxx index 7672ec7dae..ba8f25ba30 100644 --- a/tests/unit/field/test_field3d.cxx +++ b/tests/unit/field/test_field3d.cxx @@ -1949,6 +1949,36 @@ TEST_F(Field3DTest, Sqrt) { EXPECT_TRUE(IsFieldEqual(sqrt(field), 4.0)); } +TEST_F(Field3DTest, SQExpressionUsesSquareOp) { + Field3D field; + + field = 2.0; + const auto expr = field + 1.0; + + EXPECT_TRUE( + (std::is_same_v, + BinaryExpr, + std::decay_t, bout::op::Square>>)); + EXPECT_TRUE(IsFieldEqual(SQ(expr), 9.0)); +} + +TEST_F(Field3DTest, SQField3DParallelPreservesParallelSlices) { + Field3DParallel field; + + field = 2.0; + field.splitParallelSlices(); + field.yup() = 3.0; + field.ydown() = 4.0; + + const auto squared = SQ(field); + + EXPECT_TRUE((std::is_same_v, Field3DParallel>)); + EXPECT_TRUE(squared.hasParallelSlices()); + EXPECT_TRUE(IsFieldEqual(squared, 4.0)); + EXPECT_TRUE(IsFieldEqual(squared.yup(), 9.0)); + EXPECT_TRUE(IsFieldEqual(squared.ydown(), 16.0)); +} + TEST_F(Field3DTest, Abs) { Field3D field; diff --git a/tests/unit/field/test_fieldperp.cxx b/tests/unit/field/test_fieldperp.cxx index 8caafa96e4..82bf39f880 100644 --- a/tests/unit/field/test_fieldperp.cxx +++ b/tests/unit/field/test_fieldperp.cxx @@ -1577,6 +1577,14 @@ TEST_F(FieldPerpTest, Sqrt) { 
EXPECT_TRUE(IsFieldEqual(sqrt(field), 4.0)); } +TEST_F(FieldPerpTest, SQFieldPerp) { + FieldPerp field; + field.setIndex(0); + + field = 3.0; + EXPECT_TRUE(IsFieldEqual(SQ(field), 9.0)); +} + TEST_F(FieldPerpTest, Abs) { FieldPerp field; field.setIndex(0); diff --git a/tests/unit/invert/laplace/test_laplace_petsc3damg.cxx b/tests/unit/invert/laplace/test_laplace_petsc3damg.cxx index 157ec22c84..846cb9107f 100644 --- a/tests/unit/invert/laplace/test_laplace_petsc3damg.cxx +++ b/tests/unit/invert/laplace/test_laplace_petsc3damg.cxx @@ -39,9 +39,9 @@ class ForwardOperator { } const Field3D operator()(Field3D& f) { - auto result = d * Laplace_perp(f, CELL_DEFAULT, "free", "RGN_NOY") - + (Grad(f) * Grad(c2) - DDY(c2) * DDY(f) / coords->g_22) / c1 + a * f - + ex * DDX(f) + ez * DDZ(f); + Field3D result = d * Laplace_perp(f, CELL_DEFAULT, "free", "RGN_NOY") + + (Grad(f) * Grad(c2) - DDY(c2) * DDY(f) / coords->g_22) / c1 + a * f + + ex * DDX(f) + ez * DDZ(f); applyBoundaries(result, f); return result; } From 4e19f8e329426947e36293b212856e3936c2eb0a Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18 Jun 2026 16:21:55 -0700 Subject: [PATCH 38/58] Region::getLinearIndices Caches the Array indices of the region. This will enable kernels to avoid rebuilding the index array every time. 
--- include/bout/region.hxx | 36 ++++++++++++++------ tests/unit/include/bout/test_region.cxx | 45 +++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 11 deletions(-) diff --git a/include/bout/region.hxx b/include/bout/region.hxx index e00ad6d41d..88829fc65e 100644 --- a/include/bout/region.hxx +++ b/include/bout/region.hxx @@ -49,6 +49,7 @@ #include #include +#include "bout/array.hxx" #include "bout/assert.hxx" #include "bout/bout_types.hxx" #include "bout/boutexception.hxx" @@ -170,8 +171,8 @@ struct SpecificInd { int ny = -1, nz = -1; ///< Sizes of y and z dimensions SpecificInd() = default; - SpecificInd(int i, int ny, int nz) : ind(i), ny(ny), nz(nz){}; - explicit SpecificInd(int i) : ind(i){}; + SpecificInd(int i, int ny, int nz) : ind(i), ny(ny), nz(nz) {}; + explicit SpecificInd(int i) : ind(i) {}; /// Allow explicit conversion to an int explicit operator int() const { return ind; } @@ -491,10 +492,9 @@ template class Region { // Following prevents a Region being created with anything other // than Ind2D, Ind3D or IndPerp as template type - static_assert( - std::is_base_of_v< - Ind2D, T> || std::is_base_of_v || std::is_base_of_v, - "Region must be templated with one of IndPerp, Ind2D or Ind3D"); + static_assert(std::is_base_of_v || std::is_base_of_v + || std::is_base_of_v, + "Region must be templated with one of IndPerp, Ind2D or Ind3D"); public: using data_type = T; @@ -570,7 +570,7 @@ public: }; Region(RegionIndices& indices, int maxregionblocksize = MAXREGIONBLOCKSIZE) - : indices(indices), blocks(getContiguousBlocks(maxregionblocksize)){}; + : indices(indices), blocks(getContiguousBlocks(maxregionblocksize)) {}; // We need to first set the blocks, and only after that call getRegionIndices. 
// Do not put in the member initialisation @@ -595,17 +595,28 @@ public: const ContiguousBlocks& getBlocks() const { return blocks; }; const RegionIndices& getIndices() const { return indices; }; + const Array& getLinearIndices() const { + if (linearIndices.empty()) { + linearIndices = Array(indices.size()); + for (size_type i = 0; i < indices.size(); ++i) { + linearIndices[i] = indices[i].ind; + } + } + return linearIndices; + } /// Set the indices and ensure blocks updated void setIndices(RegionIndices& indicesIn, int maxregionblocksize = MAXREGIONBLOCKSIZE) { indices = indicesIn; blocks = getContiguousBlocks(maxregionblocksize); + invalidateLinearIndices(); }; /// Set the blocks and ensure indices updated void setBlocks(ContiguousBlocks& blocksIn) { blocks = blocksIn; indices = getRegionIndices(); + invalidateLinearIndices(); }; /// Return a new Region that has the same indices as this one but @@ -829,10 +840,13 @@ public: // sorted this would prevent this usage. private: - RegionIndices indices; //< Flattened indices - ContiguousBlocks blocks; //< Contiguous sections of flattened indices - int ny = -1; //< Size of y dimension - int nz = -1; //< Size of z dimension + RegionIndices indices; //< Flattened indices + ContiguousBlocks blocks; //< Contiguous sections of flattened indices + int ny = -1; //< Size of y dimension + int nz = -1; //< Size of z dimension + mutable Array linearIndices; //< Cached flattened integer indices + + void invalidateLinearIndices() const { linearIndices.clear(); } /// Helper function to create a RegionIndices, given the start and end /// points in x, y, z, and the total y, z lengths diff --git a/tests/unit/include/bout/test_region.cxx b/tests/unit/include/bout/test_region.cxx index 00137c1ce7..fe66524735 100644 --- a/tests/unit/include/bout/test_region.cxx +++ b/tests/unit/include/bout/test_region.cxx @@ -111,6 +111,51 @@ TEST_F(RegionTest, regionFromIndices) { } } +TEST_F(RegionTest, getLinearIndices) { + Region region(0, 
mesh->LocalNx - 1, 0, mesh->LocalNy - 1, 0, mesh->LocalNz - 1, + mesh->LocalNy, mesh->LocalNz); + + const auto& indices = region.getIndices(); + const auto& linearIndices = region.getLinearIndices(); + + ASSERT_EQ(linearIndices.size(), indices.size()); + for (int i = 0; i < linearIndices.size(); ++i) { + EXPECT_EQ(linearIndices[i], indices[i].ind); + } +} + +TEST_F(RegionTest, getLinearIndicesUpdatedAfterSetIndices) { + Region::RegionIndices indicesIn{{0, 1, 1}, {2, 1, 1}, {4, 1, 1}}; + Region region(indicesIn); + + const auto& initialLinearIndices = region.getLinearIndices(); + ASSERT_EQ(initialLinearIndices.size(), 3); + EXPECT_EQ(initialLinearIndices[0], 0); + EXPECT_EQ(initialLinearIndices[1], 2); + EXPECT_EQ(initialLinearIndices[2], 4); + + Region::RegionIndices newIndices{{1, 1, 1}, {3, 1, 1}}; + region.setIndices(newIndices); + + const auto& updatedLinearIndices = region.getLinearIndices(); + ASSERT_EQ(updatedLinearIndices.size(), 2); + EXPECT_EQ(updatedLinearIndices[0], 1); + EXPECT_EQ(updatedLinearIndices[1], 3); +} + +TEST_F(RegionTest, getLinearIndicesUpdatedAfterSetBlocks) { + Region::ContiguousBlocks blocks{{Ind3D{1, 1, 1}, Ind3D{3, 1, 1}}, + {Ind3D{5, 1, 1}, Ind3D{6, 1, 1}}}; + Region region; + region.setBlocks(blocks); + + const auto& linearIndices = region.getLinearIndices(); + ASSERT_EQ(linearIndices.size(), 3); + EXPECT_EQ(linearIndices[0], 1); + EXPECT_EQ(linearIndices[1], 2); + EXPECT_EQ(linearIndices[2], 5); +} + TEST_F(RegionTest, regionFromBlocks) { Region region(0, mesh->LocalNx - 1, 0, mesh->LocalNy - 1, 0, mesh->LocalNz - 1, mesh->LocalNy, mesh->LocalNz); From d67672618933adf3d4929d6c0b5e17aac95dfadf Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18 Jun 2026 17:10:06 -0700 Subject: [PATCH 39/58] Fix BinaryExpr::operator[] for Field2D expressions Need to convert Ind3D linear indices into Ind2D indices. 
--- include/bout/fieldops.hxx | 6 +++++- src/invert/laplace/impls/petsc3damg/petsc3damg.cxx | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 5e662b24d4..ad3faa2ff7 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -209,7 +209,11 @@ struct BinaryExpr { template BOUT_HOST_DEVICE BOUT_FORCEINLINE auto operator[](const IndType& d) const -> decltype(d.ind, BoutReal{}) { - return operator()(d.ind); + if constexpr (std::is_same_v) { + return operator()(d.ind / d.nz); + } else { + return operator()(d.ind); + } } inline int regionIdx(int idx) const { return indices[idx]; } diff --git a/src/invert/laplace/impls/petsc3damg/petsc3damg.cxx b/src/invert/laplace/impls/petsc3damg/petsc3damg.cxx index ddd0b15b7e..76b9b94e9a 100644 --- a/src/invert/laplace/impls/petsc3damg/petsc3damg.cxx +++ b/src/invert/laplace/impls/petsc3damg/petsc3damg.cxx @@ -4,9 +4,9 @@ * Using PETSc Solvers * ************************************************************************** - * Copyright 2013 J. Buchanan, J.Omotani + * Copyright 2013 - 2026 BOUT++ contributors * - * Contact: Ben Dudson, bd512@york.ac.uk + * Contact: Ben Dudson, dudson2@llnl.gov * * This file is part of BOUT++. * From 0b9dbf681c4f628698e5acbd076530ae19cbaa54 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18 Jun 2026 17:12:17 -0700 Subject: [PATCH 40/58] Reductions on BinaryExpr: Min/Max/Mean Perform reductions over binary expressions without having to allocate an intermediate field. 
--- include/bout/field.hxx | 43 +++++++- include/bout/fieldops.hxx | 149 +++++++++++++++++++++++++++- tests/unit/field/test_field2d.cxx | 30 ++++++ tests/unit/field/test_field3d.cxx | 48 +++++++++ tests/unit/field/test_fieldperp.cxx | 16 +++ tests/unit/test_extras.cxx | 13 +++ 6 files changed, 293 insertions(+), 6 deletions(-) diff --git a/include/bout/field.hxx b/include/bout/field.hxx index b0477d756e..ad365e8ec1 100644 --- a/include/bout/field.hxx +++ b/include/bout/field.hxx @@ -368,7 +368,19 @@ inline BoutReal min(const T& f, bool allpe = false, template inline BoutReal min(const BinaryExpr& f, bool allpe = false, const std::string& rgn = "RGN_NOBNDRY") { - return min(ResT{f}, allpe, rgn); + const auto& region = f.getMesh()->template getRegion(rgn); + const auto reduction_view = + makeReductionView(static_cast::View>(f), + region.getLinearIndices()); + BoutReal result = + bout::reduce::Min::finalize(reduceExpr(reduction_view)); + + if (allpe) { + BoutReal localresult = result; + MPI_Allreduce(&localresult, &result, 1, MPI_DOUBLE, MPI_MIN, BoutComm::get()); + } + + return result; } /// Returns true if all elements of \p f over \p region are equal. 
By @@ -458,7 +470,19 @@ inline BoutReal max(const T& f, bool allpe = false, template inline BoutReal max(const BinaryExpr& f, bool allpe = false, const std::string& rgn = "RGN_NOBNDRY") { - return max(ResT{f}, allpe, rgn); + const auto& region = f.getMesh()->template getRegion(rgn); + const auto reduction_view = + makeReductionView(static_cast::View>(f), + region.getLinearIndices()); + BoutReal result = + bout::reduce::Max::finalize(reduceExpr(reduction_view)); + + if (allpe) { + BoutReal localresult = result; + MPI_Allreduce(&localresult, &result, 1, MPI_DOUBLE, MPI_MAX, BoutComm::get()); + } + + return result; } /// Mean of \p f, excluding the boundary/guard cells by default (can @@ -499,7 +523,20 @@ inline BoutReal mean(const T& f, bool allpe = false, template inline BoutReal mean(const BinaryExpr& f, bool allpe = false, const std::string& rgn = "RGN_NOBNDRY") { - return mean(ResT{f}, allpe, rgn); + const auto& region = f.getMesh()->template getRegion(rgn); + const auto reduction_view = + makeReductionView(static_cast::View>(f), + region.getLinearIndices()); + auto state = reduceExpr(reduction_view); + + if (allpe) { + BoutReal localsum = state.sum; + int localcount = state.count; + MPI_Allreduce(&localsum, &state.sum, 1, MPI_DOUBLE, MPI_SUM, BoutComm::get()); + MPI_Allreduce(&localcount, &state.count, 1, MPI_INT, MPI_SUM, BoutComm::get()); + } + + return bout::reduce::Mean::finalize(state); } /// Exponent: pow(lhs, lhs) is \p lhs raised to the power of \p rhs diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index ad3faa2ff7..4b5eab2f24 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -5,9 +5,9 @@ #include "bout/array.hxx" #include "bout/bout_types.hxx" +#include #include #include -#include #include #if BOUT_HAS_CUDA @@ -101,8 +101,91 @@ struct Div { } }; }; // namespace op + +namespace reduce { + +struct MinState { + BoutReal value; +}; + +struct MaxState { + BoutReal value; +}; + +struct MeanState { + BoutReal sum; + 
int count; +}; + +struct Min { + using State = MinState; + + BOUT_HOST_DEVICE static State identity() { + return {std::numeric_limits::infinity()}; + } + BOUT_HOST_DEVICE static void accumulate(State& state, BoutReal value) { + state.value = value < state.value ? value : state.value; + } + BOUT_HOST_DEVICE static void combine(State& state, const State& other) { + state.value = other.value < state.value ? other.value : state.value; + } + static BoutReal finalize(const State& state) { return state.value; } +}; + +struct Max { + using State = MaxState; + + BOUT_HOST_DEVICE static State identity() { + return {-std::numeric_limits::infinity()}; + } + BOUT_HOST_DEVICE static void accumulate(State& state, BoutReal value) { + state.value = value > state.value ? value : state.value; + } + BOUT_HOST_DEVICE static void combine(State& state, const State& other) { + state.value = other.value > state.value ? other.value : state.value; + } + static BoutReal finalize(const State& state) { return state.value; } +}; + +struct Mean { + using State = MeanState; + + BOUT_HOST_DEVICE static State identity() { return {0.0, 0}; } + BOUT_HOST_DEVICE static void accumulate(State& state, BoutReal value) { + state.sum += value; + state.count += 1; + } + BOUT_HOST_DEVICE static void combine(State& state, const State& other) { + state.sum += other.sum; + state.count += other.count; + } + static BoutReal finalize(const State& state) { + return state.sum / static_cast(state.count); + } +}; + +} // namespace reduce }; // namespace bout +template +struct ReductionView { + ExprView expr; + const int* indices; + int num_indices; + + BOUT_HOST_DEVICE BOUT_FORCEINLINE int size() const { return num_indices; } + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal valueAtRegionPos(int idx) const { + return expr(indices[idx]); + } +}; + +template +ReductionView makeReductionView(const ExprView& expr, + const Array& indices) { + return ReductionView{expr, indices.size() > 0 ? 
&indices[0] : nullptr, + indices.size()}; +} + #if BOUT_HAS_CUDA && defined(__CUDACC__) template __global__ void __launch_bounds__(THREADS) evaluatorExpr(BoutReal* out, const Expr expr) { @@ -129,9 +212,39 @@ __global__ void __launch_bounds__(THREADS) evaluatorExpr(BoutReal* out, const Ex // out[idx] = expr(idx); // single‐pass fusion //} } -#endif -inline std::unordered_map> regionIndicesCache; +template +__global__ void __launch_bounds__(THREADS) + reducerExpr(typename Reducer::State* partials, const ExprView expr) { + using State = typename Reducer::State; + + __shared__ State shared[THREADS]; + + const int tid = threadIdx.x; + const int global = blockIdx.x * blockDim.x + tid; + const int stride = blockDim.x * gridDim.x; + + State local = Reducer::identity(); + + for (int i = global; i < expr.size(); i += stride) { + Reducer::accumulate(local, expr.valueAtRegionPos(i)); + } + + shared[tid] = local; + __syncthreads(); + + for (int offset = blockDim.x / 2; offset > 0; offset /= 2) { + if (tid < offset) { + Reducer::combine(shared[tid], shared[tid + offset]); + } + __syncthreads(); + } + + if (tid == 0) { + partials[blockIdx.x] = shared[0]; + } +} +#endif #if BOUT_HAS_CUDA && defined(__CUDACC__) struct StreamsRAII { @@ -169,6 +282,36 @@ struct StreamsRAII { inline struct StreamsRAII streams; #endif +template +auto reduceExpr(const ExprView& expr_view) -> typename Reducer::State { + using State = typename Reducer::State; + + ASSERT1(expr_view.size() > 0); + +#if BOUT_HAS_CUDA && defined(__CUDACC__) + cudaStream_t stream = streams.get(); + int blocks = (expr_view.size() + THREADS - 1) / THREADS; + blocks = blocks < 1024 ? 
blocks : 1024; + Array partials(blocks); + + reducerExpr<<>>(&partials[0], expr_view); + cudaStreamSynchronize(stream); + streams.put(stream); + + State result = Reducer::identity(); + for (int i = 0; i < blocks; ++i) { + Reducer::combine(result, partials[i]); + } + return result; +#else + State result = Reducer::identity(); + for (int i = 0; i < expr_view.size(); ++i) { + Reducer::accumulate(result, expr_view.valueAtRegionPos(i)); + } + return result; +#endif +} + template struct BinaryExpr { typename L::View lhs; diff --git a/tests/unit/field/test_field2d.cxx b/tests/unit/field/test_field2d.cxx index 67cf1d6c61..9eebb276ba 100644 --- a/tests/unit/field/test_field2d.cxx +++ b/tests/unit/field/test_field2d.cxx @@ -1311,6 +1311,21 @@ TEST_F(Field2DTest, Min) { EXPECT_EQ(min(field, true, "RGN_ALL"), -99.0); } +TEST_F(Field2DTest, MinBinaryExpr) { + Field2D field; + + field = 50.0; + field(0, 0) = -99.0; + field(1, 1) = 60.0; + field(1, 2) = 40.0; + field(2, 4) = 99.0; + + const auto expr = field / 2.0 - 5.0; + + EXPECT_EQ(min(expr, false), 15.0); + EXPECT_EQ(min(expr, false, "RGN_ALL"), -54.5); +} + TEST_F(Field2DTest, Max) { Field2D field; @@ -1328,6 +1343,21 @@ TEST_F(Field2DTest, Max) { EXPECT_EQ(max(field, true, "RGN_ALL"), 99.0); } +TEST_F(Field2DTest, MaxBinaryExpr) { + Field2D field; + + field = 50.0; + field(0, 0) = -99.0; + field(1, 1) = 40.0; + field(1, 2) = 60.0; + field(2, 4) = 99.0; + + const auto expr = field / 2.0 - 5.0; + + EXPECT_EQ(max(expr, false), 25.0); + EXPECT_EQ(max(expr, false, "RGN_ALL"), 44.5); +} + TEST_F(Field2DTest, Swap) { WithQuietOutput quiet{output_info}; diff --git a/tests/unit/field/test_field3d.cxx b/tests/unit/field/test_field3d.cxx index ba8f25ba30..2941bf3d89 100644 --- a/tests/unit/field/test_field3d.cxx +++ b/tests/unit/field/test_field3d.cxx @@ -2102,6 +2102,21 @@ TEST_F(Field3DTest, Min) { EXPECT_EQ(min(field, true, "RGN_ALL"), -99.0); } +TEST_F(Field3DTest, MinBinaryExpr) { + Field3D field; + + field = 50.0; + field(0, 0, 
0) = -99.0; + field(1, 1, 1) = 60.0; + field(1, 2, 2) = 40.0; + field(2, 4, 3) = 99.0; + + const auto expr = field / 2.0 - 5.0; + + EXPECT_EQ(min(expr, false), 15.0); + EXPECT_EQ(min(expr, false, "RGN_ALL"), -54.5); +} + TEST_F(Field3DTest, Max) { Field3D field; @@ -2119,6 +2134,21 @@ TEST_F(Field3DTest, Max) { EXPECT_EQ(max(field, true, "RGN_ALL"), 99.0); } +TEST_F(Field3DTest, MaxBinaryExpr) { + Field3D field; + + field = 50.0; + field(0, 0, 0) = -99.0; + field(1, 1, 1) = 40.0; + field(1, 2, 2) = 60.0; + field(2, 4, 3) = 99.0; + + const auto expr = field / 2.0 - 5.0; + + EXPECT_EQ(max(expr, false), 25.0); + EXPECT_EQ(max(expr, false, "RGN_ALL"), 44.5); +} + TEST_F(Field3DTest, Mean) { Field3D field; @@ -2138,6 +2168,24 @@ TEST_F(Field3DTest, Mean) { EXPECT_EQ(mean(field, true, "RGN_ALL"), mean_value_all); } +TEST_F(Field3DTest, MeanBinaryExpr) { + Field3D field; + + field = 50.0; + field(0, 0, 0) = 1.0; + field(1, 1, 1) = 40.0; + field(1, 2, 2) = 60.0; + field(2, 4, 3) = 109.0; + + const int npoints_all = nx * ny * nz; + const BoutReal mean_value_nobndry = 103.0; + const BoutReal mean_value_all = 103.0 + 20.0 / npoints_all; + const auto expr = field * 2.0 + 3.0; + + EXPECT_EQ(mean(expr, false), mean_value_nobndry); + EXPECT_EQ(mean(expr, false, "RGN_ALL"), mean_value_all); +} + TEST_F(Field3DTest, DC) { Field3D field; diff --git a/tests/unit/field/test_fieldperp.cxx b/tests/unit/field/test_fieldperp.cxx index 82bf39f880..4975a9e735 100644 --- a/tests/unit/field/test_fieldperp.cxx +++ b/tests/unit/field/test_fieldperp.cxx @@ -1738,6 +1738,22 @@ TEST_F(FieldPerpTest, Max) { EXPECT_EQ(max(field, true, "RGN_ALL"), 99.0); } +TEST_F(FieldPerpTest, MaxBinaryExpr) { + FieldPerp field; + field.setIndex(0); + + field = 50.0; + field(0, 0) = -99.0; + field(1, 1) = 40.0; + field(1, 2) = 60.0; + field(2, 4) = 99.0; + + const auto expr = field / 2.0 - 5.0; + + EXPECT_EQ(max(expr, false), 25.0); + EXPECT_EQ(max(expr, false, "RGN_ALL"), 44.5); +} + TEST_F(FieldPerpTest, 
OperatorEqualsFieldPerp) { FieldPerp field; diff --git a/tests/unit/test_extras.cxx b/tests/unit/test_extras.cxx index f096292ed3..30526f55c4 100644 --- a/tests/unit/test_extras.cxx +++ b/tests/unit/test_extras.cxx @@ -69,3 +69,16 @@ TEST_F(TestExtrasFieldExpr, BinaryExprCanBeIndexedWithRegionIndex) { EXPECT_DOUBLE_EQ(result[i], lhs[i] + 2.0 * rhs[i]); } } + +TEST_F(TestExtrasFieldExpr, Field2DBinaryExprCanBeIndexedWithInd3D) { + const Field2D lhs{ + makeField([](const Ind2D& i) { return static_cast(i.x()); })}; + const Field2D rhs{ + makeField([](const Ind2D& i) { return static_cast(i.y()); })}; + + const auto expr = lhs + 2.0 * rhs; + + BOUT_FOR(i, lhs.getMesh()->getRegion3D("RGN_ALL")) { + EXPECT_DOUBLE_EQ(expr[i], lhs[i] + 2.0 * rhs[i]); + } +} From 93507154f5ae15e7ecde376516430b7a4d68c07a Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18 Jun 2026 17:25:33 -0700 Subject: [PATCH 41/58] Fieldops: Tidy Min,Max,Mean reductions Move State types into their respective reduction classes. 
--- include/bout/fieldops.hxx | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 4b5eab2f24..8ae632c612 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -104,21 +104,10 @@ struct Div { namespace reduce { -struct MinState { - BoutReal value; -}; - -struct MaxState { - BoutReal value; -}; - -struct MeanState { - BoutReal sum; - int count; -}; - struct Min { - using State = MinState; + struct State { + BoutReal value; + }; BOUT_HOST_DEVICE static State identity() { return {std::numeric_limits::infinity()}; @@ -133,7 +122,9 @@ struct Min { }; struct Max { - using State = MaxState; + struct State { + BoutReal value; + }; BOUT_HOST_DEVICE static State identity() { return {-std::numeric_limits::infinity()}; @@ -148,7 +139,10 @@ struct Max { }; struct Mean { - using State = MeanState; + struct State { + BoutReal sum; + int count; + }; BOUT_HOST_DEVICE static State identity() { return {0.0, 0}; } BOUT_HOST_DEVICE static void accumulate(State& state, BoutReal value) { From 36c06ec62641d42ac9ba96b01f5ef007c952eac5 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18 Jun 2026 17:27:40 -0700 Subject: [PATCH 42/58] Formatting --- include/bout/assert.hxx | 6 +++--- include/bout/rajalib.hxx | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/bout/assert.hxx b/include/bout/assert.hxx index 954ae8dba0..28bff1f2ec 100644 --- a/include/bout/assert.hxx +++ b/include/bout/assert.hxx @@ -2,16 +2,16 @@ * Defines a macro ASSERT which throws a BoutException if a given * condition is false. Whether the assertion is tested depends on * the checking level, so assetions can be removed for optimised runs. - * + * * ASSERT ( condition ) * * level - An integer known at compile time. * condition tested if level >= CHECK * * condition - The expression to test - * + * * e.g. 
ASSERT2( condition ) will only test condition if CHECK >= 2 - * + * */ #ifndef BOUT_ASSERT_H diff --git a/include/bout/rajalib.hxx b/include/bout/rajalib.hxx index 20929304b5..29bab8f23b 100644 --- a/include/bout/rajalib.hxx +++ b/include/bout/rajalib.hxx @@ -137,7 +137,7 @@ private: /// to create variables which shadow the class members. /// #define BOUT_FOR_RAJA(index, region, ...) \ -RajaForAll(region) << [ =, ##__VA_ARGS__ ] RAJA_DEVICE(int index) mutable + RajaForAll(region) << [ =, ##__VA_ARGS__ ] RAJA_DEVICE(int index) mutable #else // BOUT_HAS_RAJA From 09aaee6b28fc4aacb16e333d958fdb02b6ce2a9c Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18 Jun 2026 17:42:27 -0700 Subject: [PATCH 43/58] BinaryExpr: Use Region::getLinearIndices Caches the indices array rather than recreating it each time. --- include/bout/fieldops.hxx | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 8ae632c612..ee746f886d 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -322,13 +322,8 @@ struct BinaryExpr { BinaryExpr(const typename L::View& lhs, const typename R::View& rhs, Func f, Mesh* mesh, CELL_LOC location, DirectionTypes directions, std::optional regionID, const Region& region) - : lhs(lhs), rhs(rhs), indices(region.getIndices().size()), f(f), mesh(mesh), - location(location), directions(directions), regionID(regionID) { - // Copy the region indices into the managed array - for (int i = 0; i < indices.size(); ++i) { - indices[i] = region.getIndices()[i].ind; - } - } + : lhs(lhs), rhs(rhs), indices(region.getLinearIndices()), f(f), mesh(mesh), + location(location), directions(directions), regionID(regionID) {} BinaryExpr(const typename L::View& lhs, const typename R::View& rhs, Func f, Mesh* mesh, CELL_LOC location, DirectionTypes directions, std::optional regionID, From 94cf70fe73d75bf375a5b24afa71cc030efc43d3 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18
Jun 2026 17:59:28 -0700 Subject: [PATCH 44/58] Options: Assignment from BinaryExpr Evaluates the expression to its result field type (ResT) and stores that in the Option. --- include/bout/options.hxx | 31 ++++++++++++++++---------- tests/unit/sys/test_options_fields.cxx | 30 +++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/include/bout/options.hxx b/include/bout/options.hxx index 03e95488a8..87503b0a22 100644 --- a/include/bout/options.hxx +++ b/include/bout/options.hxx @@ -74,19 +74,19 @@ class Options; * which can be used as a map. * * Options options; - * + * * // Set values * options["key"] = 1.0; * * // Get values. Throws BoutException if not found - * int val = options["key"]; // Sets val to 1 + * int val = options["key"]; // Sets val to 1 * * // Return as specified type. Throws BoutException if not found * BoutReal var = options["key"].as(); * * // A default value can be used if key is not found * BoutReal value = options["pi"].withDefault(3.14); - * + * * // Assign value with source label. Throws if already has a value from same source * options["newkey"].assign(1.0, "some source"); * @@ -94,7 +94,7 @@ class Options; * options["newkey"].force(2.0, "some source"); * * A legacy interface is also supported: - * + * * options.set("key", 1.0, "code"); // Sets a key from source "code" * * int val; @@ -119,9 +119,9 @@ class Options; * * Each Options object can also contain any number of sections, which are * themselves Options objects. - * + * * Options §ion = options["section"]; - * + * * which can be nested: * * options["section"]["subsection"]["value"] = 3; @@ -134,13 +134,13 @@ class Options; * * e.g. 
* options->getSection("section")->getSection("subsection")->set("value", 3); - * + * * Options also know about their parents: * * Options &parent = section.parent(); - * + * * or - * + * * Options *parent = section->getParent(); * * Root options object @@ -150,8 +150,8 @@ class Options; * there is a global singleton Options object which can be accessed with a static function * * Options &root = Options::root(); - * - * or + * + * or * * Options *root = Options::getRoot(); * @@ -193,7 +193,7 @@ public: /// @param[in] parent Parent object /// @param[in] sectionName Name of the section, including path from the root Options(Options* parent_instance, std::string full_name) - : parent_instance(parent_instance), full_name(std::move(full_name)){}; + : parent_instance(parent_instance), full_name(std::move(full_name)) {}; /// Initialise with a value /// These enable Options to be constructed using initializer lists @@ -441,6 +441,13 @@ public: return inputvalue; } + template + ResT operator=(const BinaryExpr& expr) { + ResT value{expr}; + assign(value); + return value; + } + /// Assign a value to the option. 
/// This will throw an exception if already has a value /// diff --git a/tests/unit/sys/test_options_fields.cxx b/tests/unit/sys/test_options_fields.cxx index 0c6ec953af..f94ed30386 100644 --- a/tests/unit/sys/test_options_fields.cxx +++ b/tests/unit/sys/test_options_fields.cxx @@ -36,6 +36,36 @@ TEST_F(OptionsFieldTest, StoreField2D) { EXPECT_TRUE(options.isValue()); } +TEST_F(OptionsFieldTest, StoreEvaluatedField3DExpression) { + Field3D lhs = 1.0; + Field3D rhs = 2.0; + lhs(0, 1, 1) = 3.0; + rhs(0, 1, 1) = 4.0; + + Options options; + options = lhs + rhs; + + Field3D stored = options; + + EXPECT_DOUBLE_EQ(stored(0, 1, 0), 3.0); + EXPECT_DOUBLE_EQ(stored(0, 1, 1), 7.0); +} + +TEST_F(OptionsFieldTest, StoreEvaluatedField2DExpression) { + Field2D lhs = 1.0; + Field2D rhs = 2.0; + lhs(0, 1) = 3.0; + rhs(0, 1) = 4.0; + + Options options; + options = lhs + rhs; + + Field2D stored = options; + + EXPECT_DOUBLE_EQ(stored(0, 0), 3.0); + EXPECT_DOUBLE_EQ(stored(0, 1), 7.0); +} + TEST_F(OptionsFieldTest, RetrieveField3D) { Field3D field = 1.0; field(0, 1, 1) = 2.0; From d3236d9ba3658b48bc62650f93fecddffb806933 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18 Jun 2026 21:36:19 -0700 Subject: [PATCH 45/58] BinaryExpr: if_else and if_else_zero conditionals Binary expressions that take a boolean and branch between two expressions. `if_else_zero(bool, Expr)` evaluates an expression only if `bool` is true, otherwise evaluates to zero. 
--- include/bout/field2d.hxx | 69 +++++++++++++++++++++++++++++++ include/bout/field3d.hxx | 68 ++++++++++++++++++++++++++++++ include/bout/fieldops.hxx | 12 ++++++ tests/unit/CMakeLists.txt | 1 + tests/unit/field/test_if_else.cxx | 57 +++++++++++++++++++++++++ 5 files changed, 207 insertions(+) create mode 100644 tests/unit/field/test_if_else.cxx diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index c7d8252aaa..61cc4693a3 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -458,6 +458,75 @@ FIELD2D_BOUTREAL_FIELD2D_OP(-, Sub) FIELD2D_BOUTREAL_FIELD2D_OP(*, Mul) FIELD2D_BOUTREAL_FIELD2D_OP(/, Div) +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr> +if_else(bool condition, const L& lhs, const R& rhs) { + return BinaryExpr{ + static_cast(lhs), + static_cast(rhs), + bout::op::IfElse{condition}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + std::nullopt, + lhs.getMesh()->getRegion2D("RGN_ALL")}; +} + +template +std::enable_if_t && is_expr_field3d_v, + BinaryExpr> +if_else(bool condition, const L& lhs, const R& rhs) { + ASSERT1_EXPR_COMPATIBLE(lhs, rhs); + auto regionID = rhs.getRegionID(); + int mesh_nz = rhs.getMesh()->LocalNz; + return BinaryExpr{ + static_cast(lhs).setScale(1, mesh_nz), + static_cast(rhs), + bout::op::IfElse{condition}, + rhs.getMesh(), + rhs.getLocation(), + rhs.getDirections(), + regionID, + rhs.getMesh()->getRegion("RGN_ALL")}; +} + +template +std::enable_if_t && is_expr_constant_v, + BinaryExpr, bout::op::IfElse>> +if_else(bool condition, const L& lhs, R rhs) { + return BinaryExpr, bout::op::IfElse>{ + static_cast(lhs), + static_cast::View>(rhs), + bout::op::IfElse{condition}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + std::nullopt, + lhs.getMesh()->getRegion2D("RGN_ALL")}; +} + +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr, R, bout::op::IfElse>> +if_else(bool condition, L lhs, const R& rhs) { + return BinaryExpr, R, bout::op::IfElse>{ + 
static_cast::View>(lhs), + static_cast(rhs), + bout::op::IfElse{condition}, + rhs.getMesh(), + rhs.getLocation(), + rhs.getDirections(), + std::nullopt, + rhs.getMesh()->getRegion2D("RGN_ALL")}; +} + +template || is_expr_field3d_v>> +auto if_else_zero(bool condition, const L& lhs) { + return if_else(condition, lhs, 0.0); +} + /*! * Unary minus. Returns the negative of given field, * iterates over whole domain including guard/boundary cells. diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index e27a7600d2..2c19c5e505 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -754,6 +754,74 @@ FIELD3D_BOUTREAL_FIELD3D_OP(-, Sub) FIELD3D_BOUTREAL_FIELD3D_OP(*, Mul) FIELD3D_BOUTREAL_FIELD3D_OP(/, Div) +template && is_expr_field3d_v>> +BinaryExpr if_else(bool condition, const L& lhs, + const R& rhs) { + ASSERT1_EXPR_COMPATIBLE(lhs, rhs); + auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); + return BinaryExpr{ + static_cast(lhs), + static_cast(rhs), + bout::op::IfElse{condition}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + (regionID.has_value() ? 
lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; +} + +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr> +if_else(bool condition, const L& lhs, const R& rhs) { + ASSERT1_EXPR_COMPATIBLE(lhs, rhs); + auto regionID = lhs.getRegionID(); + int mesh_nz = lhs.getMesh()->LocalNz; + return BinaryExpr{ + static_cast(lhs), + static_cast(rhs).setScale(1, mesh_nz), + bout::op::IfElse{condition}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + lhs.getMesh()->getRegion("RGN_ALL")}; +} + +template +std::enable_if_t && is_expr_constant_v, + BinaryExpr, bout::op::IfElse>> +if_else(bool condition, const L& lhs, R rhs) { + auto regionID = lhs.getRegionID(); + return BinaryExpr, bout::op::IfElse>{ + static_cast(lhs), + static_cast::View>(rhs), + bout::op::IfElse{condition}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + lhs.getMesh()->getRegion("RGN_ALL")}; +} + +template +std::enable_if_t && is_expr_field3d_v, + BinaryExpr, R, bout::op::IfElse>> +if_else(bool condition, const L& lhs, const R& rhs) { + auto regionID = rhs.getRegionID(); + return BinaryExpr, R, bout::op::IfElse>{ + static_cast::View>(lhs), + static_cast(rhs), + bout::op::IfElse{condition}, + rhs.getMesh(), + rhs.getLocation(), + rhs.getDirections(), + regionID, + rhs.getMesh()->getRegion("RGN_ALL")}; +} + Field3DParallel operator+(const Field3D& lhs, const Field3DParallel& rhs); Field3DParallel operator-(const Field3D& lhs, const Field3DParallel& rhs); Field3DParallel operator*(const Field3D& lhs, const Field3DParallel& rhs); diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index ee746f886d..923c8078ef 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -100,6 +100,18 @@ struct Div { return a / b; } }; +struct IfElse { + bool condition; + + template + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal operator()(int idx, const LView& L, + const RView& R) const { + return condition ? 
L(idx) : R(idx); + } + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal operator()(BoutReal a, BoutReal b) const { + return condition ? a : b; + } +}; }; // namespace op namespace reduce { diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 4ba304b484..4963aaf1f0 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -60,6 +60,7 @@ set(serial_tests_source ./field/test_field3d.cxx ./field/test_field_factory.cxx ./field/test_fieldgroup.cxx + ./field/test_if_else.cxx ./field/test_fieldperp.cxx ./field/test_initialprofiles.cxx ./field/test_vector2d.cxx diff --git a/tests/unit/field/test_if_else.cxx b/tests/unit/field/test_if_else.cxx new file mode 100644 index 0000000000..c608aa79db --- /dev/null +++ b/tests/unit/field/test_if_else.cxx @@ -0,0 +1,57 @@ +#include "gtest/gtest.h" + +#include "test_extras.hxx" +#include "bout/field2d.hxx" +#include "bout/field3d.hxx" + +#include "fake_mesh_fixture.hxx" + +#include + +using IfElseTest = FakeMeshFixture; + +TEST_F(IfElseTest, Field2DChoosesSelectedBranch) { + const Field2D lhs{makeField( + [](const Ind2D& i) { return static_cast(i.x() + i.y()); })}; + const Field2D rhs{makeField( + [](const Ind2D& i) { return static_cast(10 + i.x() - i.y()); })}; + + const auto expr = if_else(true, lhs, rhs); + + static_assert(std::is_same_v, + BinaryExpr>); + EXPECT_TRUE(IsFieldEqual(expr, lhs)); + EXPECT_TRUE(IsFieldEqual(if_else(false, lhs, rhs), rhs)); +} + +TEST_F(IfElseTest, Field3DMixesField2DAndField3D) { + const Field2D lhs{makeField( + [](const Ind2D& i) { return static_cast(i.x() + 2 * i.y()); })}; + const Field3D rhs{makeField( + [](const Ind3D& i) { return static_cast(100 + i.x() + i.y() + i.z()); })}; + + const auto expr = if_else(true, lhs, rhs); + const Field3D expected{lhs}; + + static_assert(std::is_same_v, + BinaryExpr>); + EXPECT_TRUE(IsFieldEqual(expr, expected)); + EXPECT_TRUE(IsFieldEqual(if_else(false, lhs, rhs), rhs)); +} + +TEST_F(IfElseTest, 
IfElseZeroKeepsExpressionWhenConditionTrue) { + const Field3D field{makeField( + [](const Ind3D& i) { return static_cast(1 + i.x() + i.y() + i.z()); })}; + const auto source = 2.0 * field + 1.0; + + EXPECT_TRUE(IsFieldEqual(if_else_zero(true, source), source)); + EXPECT_TRUE(IsFieldEqual(if_else_zero(false, source), 0.0)); +} + +TEST_F(IfElseTest, InactiveBranchIsNotEvaluatedThroughMaskedArithmetic) { + const Field2D lhs{makeField( + [](const Ind2D& i) { return static_cast(1 + i.x() + i.y()); })}; + const Field2D rhs{filledFrom(lhs, BoutNaN)}; + + EXPECT_TRUE(IsFieldEqual(if_else(true, lhs, rhs), lhs)); +} From f48cd517ed09b5fc458487f969b5587095fe2373 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18 Jun 2026 21:39:32 -0700 Subject: [PATCH 46/58] BinaryExpr: FIELD_FUNC fuse expressions If operating on a BinaryExpr, lazily evaluate field functions like `abs`, `sin`, `tanh`. This avoids allocating a field and evaluating the expression. --- include/bout/field.hxx | 16 ++++++++++++++-- tests/unit/field/test_field2d.cxx | 13 +++++++++++++ tests/unit/field/test_field3d.cxx | 13 +++++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/include/bout/field.hxx b/include/bout/field.hxx index ad365e8ec1..6547c0a31d 100644 --- a/include/bout/field.hxx +++ b/include/bout/field.hxx @@ -644,8 +644,20 @@ class Field3DParallel; } \ } \ template \ - inline auto name(const BinaryExpr& f, \ - const std::string& rgn = "RGN_ALL") { \ + inline auto name(const BinaryExpr& f) { \ + return BinaryExpr, BinaryExpr, \ + bout::op::name>{ \ + static_cast::View>(f), \ + static_cast::View>(f), \ + bout::op::name{}, \ + f.getMesh(), \ + f.getLocation(), \ + f.getDirections(), \ + f.getRegionID(), \ + f.indices}; \ + } \ + template \ + inline auto name(const BinaryExpr& f, const std::string& rgn) { \ return name(ResT{f}, rgn); \ } #endif diff --git a/tests/unit/field/test_field2d.cxx b/tests/unit/field/test_field2d.cxx index 9eebb276ba..af98d50d34 100644 --- 
a/tests/unit/field/test_field2d.cxx +++ b/tests/unit/field/test_field2d.cxx @@ -1195,6 +1195,19 @@ TEST_F(Field2DTest, Abs) { EXPECT_TRUE(IsFieldEqual(abs(field), 31.0)); } +TEST_F(Field2DTest, AbsExpressionUsesAbsOp) { + Field2D field; + + field = -2.0; + const auto expr = field + 1.0; + + EXPECT_TRUE((std::is_same_v, + BinaryExpr, + std::decay_t, bout::op::abs>>)); + EXPECT_TRUE(IsFieldEqual(abs(expr), 1.0)); + EXPECT_TRUE(IsFieldEqual(abs(expr, "RGN_ALL"), 1.0)); +} + TEST_F(Field2DTest, Exp) { Field2D field; diff --git a/tests/unit/field/test_field3d.cxx b/tests/unit/field/test_field3d.cxx index 2941bf3d89..6e1bc8ba58 100644 --- a/tests/unit/field/test_field3d.cxx +++ b/tests/unit/field/test_field3d.cxx @@ -1986,6 +1986,19 @@ TEST_F(Field3DTest, Abs) { EXPECT_TRUE(IsFieldEqual(abs(field), 31.0)); } +TEST_F(Field3DTest, AbsExpressionUsesAbsOp) { + Field3D field; + + field = -2.0; + const auto expr = field + 1.0; + + EXPECT_TRUE((std::is_same_v, + BinaryExpr, + std::decay_t, bout::op::abs>>)); + EXPECT_TRUE(IsFieldEqual(abs(expr), 1.0)); + EXPECT_TRUE(IsFieldEqual(abs(expr, "RGN_ALL"), 1.0)); +} + TEST_F(Field3DTest, Exp) { Field3D field; From ac1836b30ab04c6f05ab5f9aa8d3767f31dd4cb9 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 18 Jun 2026 22:08:19 -0700 Subject: [PATCH 47/58] Fixes for 3D metrics and unit tests Mostly specifying when to force evaluation of expressions (BinaryExpr trees) into fields. 
--- src/field/vecops.cxx | 29 ++++++++++--------- .../laplace/impls/petsc3damg/petsc3damg.cxx | 6 ++-- src/mesh/coordinates.cxx | 13 +++++---- src/mesh/parallel/fci.cxx | 2 +- src/sys/derivs.cxx | 29 ++++++++++++------- tests/unit/solver/test_nvector.cxx | 2 +- 6 files changed, 45 insertions(+), 36 deletions(-) diff --git a/src/field/vecops.cxx b/src/field/vecops.cxx index 65a6b7e938..672e8f6c09 100644 --- a/src/field/vecops.cxx +++ b/src/field/vecops.cxx @@ -1,11 +1,10 @@ /************************************************************************** * Operators on vector objects - * B.Dudson, October 2007 * ************************************************************************** - * Copyright 2010 B.D.Dudson, S.Farley, M.V.Umansky, X.Q.Xu + * Copyright 2010 - 2026 BOUT++ contributors * - * Contact: Ben Dudson, bd512@york.ac.uk + * Contact: Ben Dudson, dudson2@llnl.gov * * This file is part of BOUT++. * @@ -161,8 +160,8 @@ Coordinates::FieldMetric Div(const Vector2D& v, CELL_LOC outloc, vcn.toContravariant(); Coordinates::FieldMetric result = DDX(metric->J * vcn.x, outloc, method); - result += DDY(metric->J * vcn.y, outloc, method); - result += DDZ(metric->J * vcn.z, outloc, method); + result += DDY(Coordinates::FieldMetric{metric->J * vcn.y}, outloc, method); + result += DDZ(Coordinates::FieldMetric{metric->J * vcn.z}, outloc, method); result /= metric->J; return result; @@ -195,8 +194,8 @@ Field3D Div(const Vector3D& v, CELL_LOC outloc, const std::string& method) { } auto result = DDY(vcnJy, outloc, method); - result += DDX(vcn.x.getCoordinates()->J * vcn.x, outloc, method); - result += DDZ(vcn.z.getCoordinates()->J * vcn.z, outloc, method); + result += DDX(Field3D{vcn.x.getCoordinates()->J * vcn.x}, outloc, method); + result += DDZ(Field3D{vcn.z.getCoordinates()->J * vcn.z}, outloc, method); result /= metric->J; return result; @@ -224,10 +223,12 @@ Coordinates::FieldMetric Div(const Vector2D& v, const Field2D& f, CELL_LOC outlo Vector2D vcn = v; 
vcn.toContravariant(); - Coordinates::FieldMetric result = - FDDX(vcn.x.getCoordinates()->J * vcn.x, f, outloc, method); - result += FDDY(vcn.y.getCoordinates()->J * vcn.y, f, outloc, method); - result += FDDZ(vcn.z.getCoordinates()->J * vcn.z, f, outloc, method); + Coordinates::FieldMetric result = FDDX( + Coordinates::FieldMetric{vcn.x.getCoordinates()->J * vcn.x}, f, outloc, method); + result += FDDY(Coordinates::FieldMetric{vcn.y.getCoordinates()->J * vcn.y}, f, outloc, + method); + result += FDDZ(Coordinates::FieldMetric{vcn.z.getCoordinates()->J * vcn.z}, f, outloc, + method); result /= metric->J; return result; @@ -249,9 +250,9 @@ Field3D Div(const Vector3D& v, const Field3D& f, CELL_LOC outloc, Vector3D vcn = v; vcn.toContravariant(); - Field3D result = FDDX(vcn.x.getCoordinates()->J * vcn.x, f, outloc, method); - result += FDDY(vcn.y.getCoordinates()->J * vcn.y, f, outloc, method); - result += FDDZ(vcn.z.getCoordinates()->J * vcn.z, f, outloc, method); + Field3D result = FDDX(Field3D{vcn.x.getCoordinates()->J * vcn.x}, f, outloc, method); + result += FDDY(Field3D{vcn.y.getCoordinates()->J * vcn.y}, f, outloc, method); + result += FDDZ(Field3D{vcn.z.getCoordinates()->J * vcn.z}, f, outloc, method); result /= metric->J; return result; diff --git a/src/invert/laplace/impls/petsc3damg/petsc3damg.cxx b/src/invert/laplace/impls/petsc3damg/petsc3damg.cxx index 76b9b94e9a..9966ad654d 100644 --- a/src/invert/laplace/impls/petsc3damg/petsc3damg.cxx +++ b/src/invert/laplace/impls/petsc3damg/petsc3damg.cxx @@ -299,7 +299,7 @@ void LaplacePetsc3dAmg::updateMatrix3D() { const Field3D dc_dx = issetC ? DDX(C2) : Field3D(); const Field3D dc_dy = issetC ? DDY(C2) : Field3D(); const Field3D dc_dz = issetC ? DDZ(C2) : Field3D(); - const auto dJ_dy = DDY(coords->J / coords->g_22); + const auto dJ_dy = DDY(Coordinates::FieldMetric{coords->J / coords->g_22}); // Set up the matrix for the internal points on the grid. // Boundary conditions were set in the constructor. 
@@ -384,7 +384,7 @@ void LaplacePetsc3dAmg::updateMatrix3D() { // Must add these (rather than assign) so that elements used in // interpolation don't overwrite each other. BOUT_FOR_SERIAL(l, indexer->getRegionNobndry()) { - BoutReal C_df_dy = (coords->G2[l] - dJ_dy[l] / coords->J[l]); + BoutReal C_df_dy = coords->G2[l] - (dJ_dy[l] / coords->J[l]); if (issetD) { C_df_dy *= D[l]; } @@ -395,7 +395,7 @@ void LaplacePetsc3dAmg::updateMatrix3D() { / C1[l]; } - BoutReal C_d2f_dy2 = (coords->g22[l] - 1.0 / coords->g_22[l]); + BoutReal C_d2f_dy2 = coords->g22[l] - (1.0 / coords->g_22[l]); if (issetD) { C_d2f_dy2 *= D[l]; } diff --git a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx index 8980b00695..0004e4673f 100644 --- a/src/mesh/coordinates.cxx +++ b/src/mesh/coordinates.cxx @@ -546,7 +546,7 @@ Coordinates::Coordinates(Mesh* mesh, Options* options) transform.get()); // Compare calculated and loaded values - Field2D diff = J - Jcalc; + const auto diff = J - Jcalc; output_warn.write("\tMaximum difference in J is {:e}\n", max(abs(diff))); mesh->communicate_no_slices(J); @@ -1140,7 +1140,7 @@ int Coordinates::geometry(bool recalculate_staggered, if (localmesh->get(d2z, "d2z" + suffix, 0.0, false)) { output_warn.write( "\tWARNING: differencing quantity 'd2z' not found. Calculating from dz\n"); - d1_dz = bout::derivatives::index::DDZ(1. / dz); + d1_dz = bout::derivatives::index::DDZ(FieldMetric{1. / dz}); localmesh->communicate_no_slices(d1_dz); d1_dz = interpolateAndExtrapolate(d1_dz, location, true, true, true, transform.get()); @@ -1174,7 +1174,7 @@ int Coordinates::geometry(bool recalculate_staggered, if (localmesh->get(d2y, "d2y", 0.0, false)) { output_warn.write( "\tWARNING: differencing quantity 'd2y' not found. Calculating from dy\n"); - d1_dy = DDY(1. / dy); // d/di(1/dy) + d1_dy = DDY(FieldMetric{1. 
/ dy}); // d/di(1/dy) localmesh->communicate_no_slices(d1_dy); d1_dy = @@ -1190,7 +1190,7 @@ int Coordinates::geometry(bool recalculate_staggered, if (localmesh->get(d2z, "d2z", 0.0, false)) { output_warn.write( "\tWARNING: differencing quantity 'd2z' not found. Calculating from dz\n"); - d1_dz = bout::derivatives::index::DDZ(1. / dz); + d1_dz = bout::derivatives::index::DDZ(FieldMetric{1. / dz}); localmesh->communicate_no_slices(d1_dz); d1_dz = @@ -1578,7 +1578,7 @@ Coordinates::FieldMetric Coordinates::Div_par(const Field2D& f, CELL_LOC outloc, // Coordinates object auto Bxy_floc = f.getCoordinates()->Bxy; - return Bxy * Grad_par(f / Bxy_floc, outloc, method); + return Bxy * Grad_par(FieldMetric{f / Bxy_floc}, outloc, method); } Field3D Coordinates::Div_par(const Field3DParallel& f, CELL_LOC outloc, @@ -1771,7 +1771,8 @@ FieldPerp Coordinates::Delp2(const FieldPerp& f, CELL_LOC outloc, bool useFFT) { Coordinates::FieldMetric Coordinates::Laplace_par(const Field2D& f, CELL_LOC outloc) { ASSERT1(location == outloc || outloc == CELL_DEFAULT); - return D2DY2(f, outloc) / g_22 + DDY(J / g_22, outloc) * DDY(f, outloc) / J; + return D2DY2(f, outloc) / g_22 + + DDY(FieldMetric{J / g_22}, outloc) * DDY(f, outloc) / J; } Field3D Coordinates::Laplace_par(const Field3DParallel& f, CELL_LOC outloc) { diff --git a/src/mesh/parallel/fci.cxx b/src/mesh/parallel/fci.cxx index 0b4ea5f6d1..f498c60e0e 100644 --- a/src/mesh/parallel/fci.cxx +++ b/src/mesh/parallel/fci.cxx @@ -471,7 +471,7 @@ void FCITransform::outputVars(Options& output_options) { void FCITransform::loadParallelMetrics(Coordinates* coords) { #if BOUT_USE_METRIC_3D output_info.write("\tLoading parallel metrics\n"); - const auto JB0 = coords->J * coords->Bxy; + const Coordinates::FieldMetric JB0 = coords->J * coords->Bxy; coords->J.splitParallelSlices(); coords->J.disallowCalcParallelSlices(); coords->J.resetRegionParallel(true); diff --git a/src/sys/derivs.cxx b/src/sys/derivs.cxx index aa42be66c2..e449dbcd30 100644 
--- a/src/sys/derivs.cxx +++ b/src/sys/derivs.cxx @@ -17,9 +17,9 @@ * Div(v*f) * ************************************************************************** - * Copyright 2010 B.D.Dudson, S.Farley, M.V.Umansky, X.Q.Xu + * Copyright 2010 - 2026 BOUT++ contributors * - * Contact: Ben Dudson, bd512@york.ac.uk + * Contact: Ben Dudson, dudson2@llnl.gov * * This file is part of BOUT++. * @@ -38,16 +38,23 @@ * **************************************************************************/ +#include +#include #include #include +#include #include #include +#include +#include #include #include #include #include #include +#include + /******************************************************************************* * First central derivatives *******************************************************************************/ @@ -96,7 +103,7 @@ Coordinates::FieldMetric DDZ(const Field2D& f, CELL_LOC UNUSED(outloc), Vector3D DDZ(const Vector3D& v, CELL_LOC outloc, const std::string& method, const std::string& region) { Vector3D result(v.getMesh()); - Coordinates* metric = v.x.getCoordinates(outloc); + const Coordinates* metric = v.x.getCoordinates(outloc); if (v.covariant) { // From equation (2.6.32) in D'Haeseleer @@ -148,7 +155,7 @@ Vector2D DDZ(const Vector2D& v, CELL_LOC UNUSED(outloc), Field3D D2DX2(const Field3D& f, CELL_LOC outloc, const std::string& method, const std::string& region) { - Coordinates* coords = f.getCoordinates(outloc); + const Coordinates* coords = f.getCoordinates(outloc); Field3D result = bout::derivatives::index::D2DX2(f, outloc, method, region) / SQ(coords->dx); @@ -167,9 +174,9 @@ Field3D D2DX2(const Field3D& f, CELL_LOC outloc, const std::string& method, Coordinates::FieldMetric D2DX2(const Field2D& f, CELL_LOC outloc, const std::string& method, const std::string& region) { - Coordinates* coords = f.getCoordinates(outloc); + const Coordinates* coords = f.getCoordinates(outloc); - Field2D result = + Coordinates::FieldMetric result = 
bout::derivatives::index::D2DX2(f, outloc, method, region) / SQ(coords->dx); if (coords->non_uniform) { @@ -185,7 +192,7 @@ Coordinates::FieldMetric D2DX2(const Field2D& f, CELL_LOC outloc, Field3D D2DY2(const Field3D& f, CELL_LOC outloc, const std::string& method, const std::string& region) { - Coordinates* coords = f.getCoordinates(outloc); + const Coordinates* coords = f.getCoordinates(outloc); Field3D result = bout::derivatives::index::D2DY2(f, outloc, method, region) / SQ(coords->dy); @@ -204,9 +211,9 @@ Field3D D2DY2(const Field3D& f, CELL_LOC outloc, const std::string& method, Coordinates::FieldMetric D2DY2(const Field2D& f, CELL_LOC outloc, const std::string& method, const std::string& region) { - Coordinates* coords = f.getCoordinates(outloc); + const Coordinates* coords = f.getCoordinates(outloc); - Field2D result = + Coordinates::FieldMetric result = bout::derivatives::index::D2DY2(f, outloc, method, region) / SQ(coords->dy); if (coords->non_uniform) { // Correction for non-uniform f.getMesh() @@ -286,7 +293,7 @@ Coordinates::FieldMetric D2DXDY(const Field2D& f, CELL_LOC outloc, const std::string& method, const std::string& region, const std::string& dfdy_boundary_condition, const std::string& dfdy_region) { - std::string dy_region = dfdy_region.empty() ? region : dfdy_region; + const std::string dy_region = dfdy_region.empty() ? region : dfdy_region; // If staggering in x, take y-derivative at f's location. const auto y_location = @@ -311,7 +318,7 @@ Coordinates::FieldMetric D2DXDY(const Field2D& f, CELL_LOC outloc, Field3D D2DXDY(const Field3D& f, CELL_LOC outloc, const std::string& method, const std::string& region, const std::string& dfdy_boundary_condition, const std::string& dfdy_region) { - std::string dy_region = dfdy_region.empty() ? region : dfdy_region; + const std::string dy_region = dfdy_region.empty() ? region : dfdy_region; // If staggering in x, take y-derivative at f's location. 
const auto y_location = diff --git a/tests/unit/solver/test_nvector.cxx b/tests/unit/solver/test_nvector.cxx index dc45008caa..18901b067f 100644 --- a/tests/unit/solver/test_nvector.cxx +++ b/tests/unit/solver/test_nvector.cxx @@ -130,7 +130,7 @@ TYPED_TEST(BoutNVectorTest, LinearAndPointwiseOperations) { auto vz = makeNVector(BoutNVector::create(this->sunctx, z, true)); N_VLinearSum(2.0, vx.get(), -1.0, vy.get(), vz.get()); - auto expected = 2.0 * x - y; + TypeParam expected = 2.0 * x - y; EXPECT_TRUE(IsFieldEqual(z, expected)); N_VProd(vx.get(), vy.get(), vz.get()); From b9de118a6d55e3333bcaf711604443bc70234e12 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Fri, 19 Jun 2026 10:04:46 -0700 Subject: [PATCH 48/58] Fixes and tidying - Replaced `auto` with `Field3D` in a couple of places, to force evaluation of a BinaryExpr. - Removed some unnecessary intermediates, now that `abs` and `max` operate on BinaryExpr. - Clang tidy changes to headers etc. Integrated tests are now compiling locally for 2D and 3D metrics. 
--- include/bout/fieldops.hxx | 12 ++++------ include/bout/fieldperp.hxx | 19 ++++++++------- include/bout/interpolation.hxx | 24 +++++++++++++------ include/bout/utils.hxx | 6 +++-- .../laplace/impls/naulin/naulin_laplace.cxx | 7 +++--- src/mesh/coordinates.cxx | 13 ++++------ .../test-petsc_laplace/test_petsc_laplace.cxx | 2 +- .../invert/laplace/test_laplace_hypre3d.cxx | 10 ++++---- 8 files changed, 50 insertions(+), 43 deletions(-) diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 923c8078ef..df513c0290 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -3,12 +3,16 @@ #define BOUT_FIELDOPS_HXX #include "bout/array.hxx" +#include "bout/assert.hxx" #include "bout/bout_types.hxx" +#include "bout/build_config.hxx" +#include "bout/build_defines.hxx" +#include "bout/region.hxx" +#include #include #include #include -#include #if BOUT_HAS_CUDA #include @@ -198,12 +202,6 @@ __global__ void __launch_bounds__(THREADS) evaluatorExpr(BoutReal* out, const Ex int tid = threadIdx.x + blockIdx.x * blockDim.x; int e = expr.size(); - // In-bounds version - //if (tid < e) { - // int idx = expr.regionIdx(tid); - // out[idx] = expr(idx); // single‐pass fusion - //} - // Out-of-bounds version if (tid >= e) { return; diff --git a/include/bout/fieldperp.hxx b/include/bout/fieldperp.hxx index 74414d01d6..9c4a957ee8 100644 --- a/include/bout/fieldperp.hxx +++ b/include/bout/fieldperp.hxx @@ -23,24 +23,27 @@ * **************************************************************************/ -#include -#include -#include class FieldPerp; #ifndef BOUT_FIELDPERP_H #define BOUT_FIELDPERP_H -#include "bout/field.hxx" - #include "bout/array.hxx" #include "bout/assert.hxx" +#include "bout/bout_types.hxx" +#include "bout/build_config.hxx" +#include "bout/field.hxx" +#include "bout/fieldops.hxx" #include "bout/region.hxx" - #include "bout/unused.hxx" +#include "bout/utils.hxx" +#include +#include #include #include +#include +#include class Field2D; // 
#include "bout/field2d.hxx" class Field3D; // #include "bout/field3d.hxx" @@ -230,7 +233,7 @@ public: jx, jz, nx, nz); } #endif - return data[jx * nz + jz]; + return data[(jx * nz) + jz]; } /*! @@ -247,7 +250,7 @@ public: jx, jz, nx, nz); } #endif - return data[jx * nz + jz]; + return data[(jx * nz) + jz]; } /*! diff --git a/include/bout/interpolation.hxx b/include/bout/interpolation.hxx index bf4ac5779b..40ff603712 100644 --- a/include/bout/interpolation.hxx +++ b/include/bout/interpolation.hxx @@ -26,9 +26,19 @@ #ifndef BOUT_INTERP_H #define BOUT_INTERP_H +#include "bout/assert.hxx" +#include "bout/bout_types.hxx" +#include "bout/boutexception.hxx" +#include "bout/field2d.hxx" +#include "bout/field3d.hxx" #include "bout/mesh.hxx" +#include "bout/msg_stack.hxx" +#include "bout/region.hxx" #include "bout/stencils.hxx" +#include +#include + /// Perform interpolation between centre -> shifted or vice-versa /*! Interpolate using 4th-order staggered formula @@ -56,14 +66,14 @@ inline BoutReal interp(const stencil& s) { */ template std::enable_if_t || bout::utils::is_Field3D_v, const T> -interp_to(const T& var, CELL_LOC loc, const std::string region = "RGN_ALL") { +interp_to(const T& var, CELL_LOC loc, const std::string& region = "RGN_ALL") { static_assert(bout::utils::is_Field2D_v || bout::utils::is_Field3D_v, "interp_to must be templated with one of Field2D or Field3D."); ASSERT1(loc != CELL_DEFAULT); // doesn't make sense to interplote to CELL_DEFAULT Mesh* fieldmesh = var.getMesh(); - if ((loc != CELL_CENTRE) && (fieldmesh->StaggerGrids == false)) { + if ((loc != CELL_CENTRE) && !fieldmesh->StaggerGrids) { throw BoutException("Asked to interpolate, but StaggerGrids is disabled!"); } @@ -72,7 +82,7 @@ interp_to(const T& var, CELL_LOC loc, const std::string region = "RGN_ALL") { return var; } - // NOTE: invalidateGuards() is called in Field3D::alloctate() if the data + // NOTE: invalidateGuards() is called in Field3D::allocate() if the data // block is not already 
allocated, so will be called here if // region==RGN_NOBNDRY T result{emptyFrom(var).setLocation(loc)}; @@ -205,14 +215,14 @@ interp_to(const T& var, CELL_LOC loc, const std::string region = "RGN_ALL") { template std::enable_if_t && !bout::utils::is_Field3D_v, const Field3D> -interp_to(const E& expr, CELL_LOC loc, const std::string rgn = "RGN_ALL") { - return interp_to(Field3D{expr}, loc, std::move(rgn)); +interp_to(const E& expr, CELL_LOC loc, const std::string& rgn = "RGN_ALL") { + return interp_to(Field3D{expr}, loc, rgn); } template std::enable_if_t && !bout::utils::is_Field2D_v, const Field2D> -interp_to(const E& expr, CELL_LOC loc, const std::string rgn = "RGN_ALL") { - return interp_to(Field2D{expr}, loc, std::move(rgn)); +interp_to(const E& expr, CELL_LOC loc, const std::string& rgn = "RGN_ALL") { + return interp_to(Field2D{expr}, loc, rgn); } #endif // BOUT_INTERP_H diff --git a/include/bout/utils.hxx b/include/bout/utils.hxx index 6fb7d38dc2..3dbae60d74 100644 --- a/include/bout/utils.hxx +++ b/include/bout/utils.hxx @@ -29,12 +29,11 @@ #ifndef BOUT_UTILS_H #define BOUT_UTILS_H -#include "bout/build_config.hxx" - #include "bout/array.hxx" #include "bout/assert.hxx" #include "bout/bout_types.hxx" #include "bout/boutexception.hxx" +#include "bout/build_config.hxx" #include "bout/region.hxx" #include "bout/unused.hxx" @@ -46,6 +45,9 @@ #include #include #include +#include +#include +#include class Field; diff --git a/src/invert/laplace/impls/naulin/naulin_laplace.cxx b/src/invert/laplace/impls/naulin/naulin_laplace.cxx index 647ed08bd9..ae8e78d1ff 100644 --- a/src/invert/laplace/impls/naulin/naulin_laplace.cxx +++ b/src/invert/laplace/impls/naulin/naulin_laplace.cxx @@ -18,9 +18,9 @@ * ========= * ************************************************************************** - * Copyright 2018 B.D.Dudson, M. Loiten, J. 
Omotani + * Copyright 2018 - 2026 BOUT++ contributors * - * Contact: Ben Dudson, benjamin.dudson@york.ac.uk + * Contact: Ben Dudson, dudson2@llnl.gov * * This file is part of BOUT++. * @@ -274,9 +274,8 @@ Field3D LaplaceNaulin::solve(const Field3D& rhs, const Field3D& x0) { delp2solver->setCoefC2(C2coef_DC); // Use this below to normalize error for relative error estimate - Field3D SQField = SQ(rhsOverD); BoutReal RMS_rhsOverD = sqrt(mean( - SQField, true, + SQ(rhsOverD), true, "RGN_NOBNDRY")); // use sqrt(mean(SQ)) to make sure we do not divide by zero at a point BoutReal error_rel = 1e20, error_abs = 1e20, last_error = error_abs; diff --git a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx index 0004e4673f..a4602a5527 100644 --- a/src/mesh/coordinates.cxx +++ b/src/mesh/coordinates.cxx @@ -546,8 +546,7 @@ Coordinates::Coordinates(Mesh* mesh, Options* options) transform.get()); // Compare calculated and loaded values - const auto diff = J - Jcalc; - output_warn.write("\tMaximum difference in J is {:e}\n", max(abs(diff))); + output_warn.write("\tMaximum difference in J is {:e}\n", max(abs(J - Jcalc))); mesh->communicate_no_slices(J); @@ -572,8 +571,7 @@ Coordinates::Coordinates(Mesh* mesh, Options* options) Bxy = interpolateAndExtrapolate(Bxy, location, extrapolate_x, extrapolate_y, false, transform.get()); - FieldMetric diff = Bxy - Bcalc; - output_warn.write("\tMaximum difference in Bxy is {:e}\n", max(abs(diff))); + output_warn.write("\tMaximum difference in Bxy is {:e}\n", max(abs(Bxy - Bcalc))); } // Check Bxy @@ -757,9 +755,8 @@ Coordinates::Coordinates(Mesh* mesh, Options* options, const CELL_LOC loc, J = interpolateAndExtrapolate(J, location, extrapolate_x, extrapolate_y, false, transform.get()); - FieldMetric diff = J - Jcalc; // Compare calculated and loaded values - output_warn.write("\tMaximum difference in J is %e\n", max(abs(diff))); + output_warn.write("\tMaximum difference in J is %e\n", max(abs(J - Jcalc))); // Re-evaluate Bxy using new J Bxy = 
sqrt(g_22) / J; @@ -783,9 +780,7 @@ Coordinates::Coordinates(Mesh* mesh, Options* options, const CELL_LOC loc, } else { Bxy = interpolateAndExtrapolate(Bxy, location, extrapolate_x, extrapolate_y, false, transform.get()); - - FieldMetric diff = Bxy - Bcalc; - output_warn.write("\tMaximum difference in Bxy is %e\n", max(abs(diff))); + output_warn.write("\tMaximum difference in Bxy is %e\n", max(abs(Bxy - Bcalc))); } // Check Bxy diff --git a/tests/integrated/test-petsc_laplace/test_petsc_laplace.cxx b/tests/integrated/test-petsc_laplace/test_petsc_laplace.cxx index 39b1918480..04182293e7 100644 --- a/tests/integrated/test-petsc_laplace/test_petsc_laplace.cxx +++ b/tests/integrated/test-petsc_laplace/test_petsc_laplace.cxx @@ -99,7 +99,7 @@ void check_laplace(int test_num, std::string_view test_name, Laplacian& invert, template Field3D forward_laplace(const Field3D& field, const T& acoef, const T& ccoef, const T& dcoef) { - auto bcoef = + Field3D bcoef = dcoef * Delp2(field) + Grad_perp(ccoef) * Grad_perp(field) / ccoef + acoef * field; apply_flat_boundary(bcoef); return bcoef; diff --git a/tests/unit/invert/laplace/test_laplace_hypre3d.cxx b/tests/unit/invert/laplace/test_laplace_hypre3d.cxx index 3b1bbc5d39..a721b96833 100644 --- a/tests/unit/invert/laplace/test_laplace_hypre3d.cxx +++ b/tests/unit/invert/laplace/test_laplace_hypre3d.cxx @@ -1,5 +1,7 @@ #include "bout/build_defines.hxx" +#if BOUT_HAS_HYPRE + #include #include @@ -18,8 +20,6 @@ #include "bout/options.hxx" #include "bout/vecops.hxx" -#if BOUT_HAS_HYPRE - #include "fake_mesh_fixture.hxx" // The unit tests use the global mesh @@ -39,9 +39,9 @@ class ForwardOperator { } const Field3D operator()(Field3D& f) { - auto result = d * Laplace_perp(f, CELL_DEFAULT, "free", "RGN_NOY") - + (Grad(f) * Grad(c2) - DDY(c2) * DDY(f) / coords->g_22) / c1 + a * f - + ex * DDX(f) + ez * DDZ(f); + Field3D result = d * Laplace_perp(f, CELL_DEFAULT, "free", "RGN_NOY") + + (Grad(f) * Grad(c2) - DDY(c2) * DDY(f) / 
coords->g_22) / c1 + a * f + + ex * DDX(f) + ez * DDZ(f); applyBoundaries(result, f); return result; } From 0fdeedbb5225db95c8b11a8f7cba4dc4f027ea3a Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Fri, 19 Jun 2026 11:30:27 -0700 Subject: [PATCH 49/58] evaluateBinaryExpr: Allocate full size Arrays The data arrays should be the full size of the field, not the size of the expression domain. --- include/bout/field2d.hxx | 5 ++++- include/bout/field3d.hxx | 5 ++++- include/bout/fieldperp.hxx | 5 ++++- tests/unit/field/test_field2d.cxx | 10 ++++++++++ tests/unit/field/test_field3d.cxx | 10 ++++++++++ tests/unit/field/test_fieldperp.cxx | 11 +++++++++++ 6 files changed, 43 insertions(+), 3 deletions(-) diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index 61cc4693a3..807b0ac1a3 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -350,7 +350,10 @@ public: private: template static Array evaluateBinaryExpr(const BinaryExpr& expr) { - Array data{expr.size()}; + const auto* mesh = expr.getMesh(); + ASSERT1(mesh != nullptr); + + Array data{mesh->LocalNx * mesh->LocalNy}; expr.evaluate(&data[0]); return data; } diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 2c19c5e505..e3304bd791 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -644,7 +644,10 @@ protected: template static Array evaluateBinaryExpr(const BinaryExpr& expr) { - Array data{expr.size()}; + const auto* mesh = expr.getMesh(); + ASSERT1(mesh != nullptr); + + Array data{mesh->LocalNx * mesh->LocalNy * mesh->LocalNz}; expr.evaluate(&data[0]); return data; } diff --git a/include/bout/fieldperp.hxx b/include/bout/fieldperp.hxx index 9c4a957ee8..9063b23890 100644 --- a/include/bout/fieldperp.hxx +++ b/include/bout/fieldperp.hxx @@ -342,7 +342,10 @@ public: private: template static Array evaluateBinaryExpr(const BinaryExpr& expr) { - Array data{expr.size()}; + const auto* mesh = expr.getMesh(); + ASSERT1(mesh != nullptr); + + Array 
data{mesh->LocalNx * mesh->LocalNz}; expr.evaluate(&data[0]); return data; } diff --git a/tests/unit/field/test_field2d.cxx b/tests/unit/field/test_field2d.cxx index af98d50d34..23eddc15ef 100644 --- a/tests/unit/field/test_field2d.cxx +++ b/tests/unit/field/test_field2d.cxx @@ -1208,6 +1208,16 @@ TEST_F(Field2DTest, AbsExpressionUsesAbsOp) { EXPECT_TRUE(IsFieldEqual(abs(expr, "RGN_ALL"), 1.0)); } +TEST_F(Field2DTest, RegionLimitedExpressionConstructsField2D) { + Field2D field; + + field = -31.0; + + Field2D result = abs(field, "RGN_NOBNDRY"); + + EXPECT_TRUE(IsFieldEqual(result, 31.0, "RGN_NOBNDRY")); +} + TEST_F(Field2DTest, Exp) { Field2D field; diff --git a/tests/unit/field/test_field3d.cxx b/tests/unit/field/test_field3d.cxx index 6e1bc8ba58..9feae8343b 100644 --- a/tests/unit/field/test_field3d.cxx +++ b/tests/unit/field/test_field3d.cxx @@ -1999,6 +1999,16 @@ TEST_F(Field3DTest, AbsExpressionUsesAbsOp) { EXPECT_TRUE(IsFieldEqual(abs(expr, "RGN_ALL"), 1.0)); } +TEST_F(Field3DTest, RegionLimitedExpressionConstructsField3D) { + Field3D field; + + field = -31.0; + + Field3D result = abs(field, "RGN_NOBNDRY"); + + EXPECT_TRUE(IsFieldEqual(result, 31.0, "RGN_NOBNDRY")); +} + TEST_F(Field3DTest, Exp) { Field3D field; diff --git a/tests/unit/field/test_fieldperp.cxx b/tests/unit/field/test_fieldperp.cxx index 4975a9e735..060355a7db 100644 --- a/tests/unit/field/test_fieldperp.cxx +++ b/tests/unit/field/test_fieldperp.cxx @@ -1593,6 +1593,17 @@ TEST_F(FieldPerpTest, Abs) { EXPECT_TRUE(IsFieldEqual(abs(field), 31.0)); } +TEST_F(FieldPerpTest, RegionLimitedExpressionConstructsFieldPerp) { + FieldPerp field; + field.setIndex(0); + + field = -31.0; + + FieldPerp result = abs(field, "RGN_NOX"); + + EXPECT_TRUE(IsFieldEqual(result, 31.0, "RGN_NOX")); +} + TEST_F(FieldPerpTest, Exp) { FieldPerp field; field.setIndex(0); From 250a566064bb2a3641a017bf9fc8de90a8e6230a Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Fri, 19 Jun 2026 11:40:50 -0700 Subject: [PATCH 50/58] 
Coordinates: Evaluate to FieldMetric Force evaluation of BinaryExpr by assigning to a FieldMetric. --- src/mesh/coordinates.cxx | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx index 1591cee27c..4f7a747a7b 100644 --- a/src/mesh/coordinates.cxx +++ b/src/mesh/coordinates.cxx @@ -2036,7 +2036,6 @@ const Coordinates::FieldMetric& Coordinates::g_22_yhigh() const { return *_g_22_yhigh; } _g_22_yhigh.emplace(emptyFrom(g_22)); - //_g_22_yhigh->setLocation(CELL_YHIGH); auto* mesh = Bxy.getMesh(); if (Bxy.isFci()) { if (mesh->get(_g_22_yhigh.value(), "g_22_cell_yhigh", 0.0, false) != 0) { @@ -2052,7 +2051,7 @@ const Coordinates::FieldMetric& Coordinates::g_22_yhigh() const { } void Coordinates::_compute_cell_area_x() const { - const auto area_centre = sqrt(g_22 * g_33 - SQ(g_23)) * dy * dz; + const FieldMetric area_centre = sqrt(g_22 * g_33 - SQ(g_23)) * dy * dz; _cell_area_xlow.emplace(emptyFrom(area_centre)); _cell_area_xhigh.emplace(emptyFrom(area_centre)); // We cannot setLocation, as that would trigger the computation of staggered @@ -2068,7 +2067,7 @@ void Coordinates::_compute_cell_area_x() const { void Coordinates::_compute_cell_area_y() const { auto* mesh = Bxy.getMesh(); if (g_11.isFci()) { - const auto jxz_centre = sqrt(g_11 * g_33 - SQ(g_13)); + const FieldMetric jxz_centre = sqrt(g_11 * g_33 - SQ(g_13)); auto jxz_ylow = emptyFrom(jxz_centre); auto jxz_yhigh = emptyFrom(jxz_centre); @@ -2096,7 +2095,7 @@ void Coordinates::_compute_cell_area_y() const { _cell_area_yhigh.emplace(jxz_yhigh * dx * dz); } else { // Field aligned - const auto area_centre = sqrt(g_11 * g_33 - SQ(g_13)) * dx * dz; + const FieldMetric area_centre = sqrt(g_11 * g_33 - SQ(g_13)) * dx * dz; _cell_area_ylow.emplace(emptyFrom(area_centre)); _cell_area_yhigh.emplace(emptyFrom(area_centre)); // We cannot setLocation, as that would trigger the computation of staggered @@ -2117,12 +2116,11 @@ void 
Coordinates::_compute_cell_area_y() const { } void Coordinates::_compute_cell_area_z() const { - const auto area_centre = sqrt(g_11 * g_22 - SQ(g_12)) * dx * dy; + const FieldMetric area_centre = sqrt(g_11 * g_22 - SQ(g_12)) * dx * dy; _cell_area_zlow.emplace(emptyFrom(area_centre)); _cell_area_zhigh.emplace(emptyFrom(area_centre)); // We cannot setLocation, as that would trigger the computation of staggered // metrics. - //ASSERT0(mesh->zstart > 0); BOUT_FOR(i, area_centre.getRegion("RGN_NOZ")) { (*_cell_area_zlow)[i] = 0.5 * (area_centre[i] + area_centre[i.zm()]); (*_cell_area_zhigh)[i] = 0.5 * (area_centre[i] + area_centre[i.zp()]); From 512e8535f82a38b155b678d47c9eed3d0ee3efea Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Fri, 19 Jun 2026 11:41:27 -0700 Subject: [PATCH 51/58] Clang tidying strLocation doesn't seem to be used and may return pointer to temporary string. Added missing headers. --- src/mesh/interpolation_xz.cxx | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/mesh/interpolation_xz.cxx b/src/mesh/interpolation_xz.cxx index ec8bcc0502..04d20769e4 100644 --- a/src/mesh/interpolation_xz.cxx +++ b/src/mesh/interpolation_xz.cxx @@ -24,16 +24,20 @@ **************************************************************************/ #include "parallel/fci_comm.hxx" +#include +#include +#include +#include #include #include +#include #include #include +#include void printLocation(const Field3D& var) { output << toString(var.getLocation()); } void printLocation(const Field2D& var) { output << toString(var.getLocation()); } -const char* strLocation(CELL_LOC loc) { return toString(loc).c_str(); } - const Field3D interpolate(const Field3D& f, const Field3D& delta_x, const Field3D& delta_z) { XZLagrange4pt interpolateMethod{f.getMesh()}; @@ -46,7 +50,7 @@ const Field3D interpolate(const Field2D& f, const Field3D& delta_x, } const Field3D interpolate(const Field2D& f, const Field3D& delta_x) { - Mesh* mesh = f.getMesh(); + const Mesh* mesh 
= f.getMesh(); ASSERT1(mesh == delta_x.getMesh()); Field3D result{emptyFrom(delta_x)}; From 7cb424fb0b5dc09497259bb9c8fb2c896f9492bd Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Fri, 19 Jun 2026 14:35:07 -0700 Subject: [PATCH 52/58] BinaryExpr: Assignment operators propagate metadata Assigning an expression to a field should update the field metadata (location, Y slice for FieldPerp) to the value from the BinaryExpr. Unit tests now pass locally with CHECK=4. --- include/bout/field.hxx | 34 +++++++++++++++++++++++--- include/bout/field2d.hxx | 10 +++++--- include/bout/field3d.hxx | 15 ++++++++---- include/bout/fieldops.hxx | 11 ++++++--- include/bout/fieldperp.hxx | 17 ++++++++++++- include/bout/petsc_interface.hxx | 10 ++++---- src/field/fieldperp.cxx | 2 +- src/mesh/coordinates.cxx | 4 +-- tests/unit/field/test_field2d.cxx | 19 ++++++++++++++- tests/unit/field/test_field3d.cxx | 21 +++++++++++++++- tests/unit/field/test_fieldperp.cxx | 38 ++++++++++++++++++++++++++++- 11 files changed, 153 insertions(+), 28 deletions(-) diff --git a/include/bout/field.hxx b/include/bout/field.hxx index 6547c0a31d..a8e1952546 100644 --- a/include/bout/field.hxx +++ b/include/bout/field.hxx @@ -605,6 +605,28 @@ T pow(BoutReal lhs, const T& rhs, const std::string& rgn = "RGN_ALL") { * */ class Field3DParallel; +class FieldPerp; + +namespace bout::detail { +template +std::optional getPerpYIndex(const T& value) { + if constexpr (std::is_same_v, ::FieldPerp>) { + return value.getIndex(); + } else { + return std::nullopt; + } +} + +template +std::optional getPerpYIndex(const BinaryExpr& expr) { + if constexpr (std::is_same_v) { + return expr.getIndex(); + } else { + return std::nullopt; + } +} +} // namespace bout::detail + #ifdef FIELD_FUNC #error This macro has already been defined #else @@ -640,7 +662,8 @@ class Field3DParallel; f.getLocation(), \ f.getDirections(), \ std::nullopt, \ - f.getRegion(rgn)}; \ + f.getRegion(rgn), \ + bout::detail::getPerpYIndex(f)}; \ } \ } \ template 
\ @@ -654,7 +677,8 @@ class Field3DParallel; f.getLocation(), \ f.getDirections(), \ f.getRegionID(), \ - f.indices}; \ + f.indices, \ + bout::detail::getPerpYIndex(f)}; \ } \ template \ inline auto name(const BinaryExpr& f, const std::string& rgn) { \ @@ -696,7 +720,8 @@ inline auto SQ(const T& f, const std::string& rgn = "RGN_ALL") { f.getLocation(), f.getDirections(), std::nullopt, - f.getRegion(rgn)}; + f.getRegion(rgn), + bout::detail::getPerpYIndex(f)}; } } @@ -711,7 +736,8 @@ inline auto SQ(const BinaryExpr& f) { f.getLocation(), f.getDirections(), f.getRegionID(), - f.indices}; + f.indices, + bout::detail::getPerpYIndex(f)}; } template diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index 807b0ac1a3..7ab8bdfb25 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -189,11 +189,15 @@ public: template std::enable_if_t, Field2D&> operator=(const BinaryExpr& expr) { - if (isAllocated()) { - expr.evaluate(&data[0]); - } else { + if (!isAllocated() || getMesh() != expr.getMesh()) { *this = Field2D{expr}; + return *this; } + + setLocation(expr.getLocation()); + setDirections(expr.getDirections()); + allocate(); + expr.evaluate(&data[0]); return *this; } diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index e3304bd791..112e73219e 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -489,13 +489,18 @@ public: Field3D& operator=(BoutReal val); template std::enable_if_t, Field3D&> - operator=(BinaryExpr& expr) { - regionID = expr.getRegionID(); - if (isAllocated()) { - expr.evaluate(&data[0]); - } else { + operator=(const BinaryExpr& expr) { + if (!isAllocated() || getMesh() != expr.getMesh()) { *this = Field3D{expr}; + return *this; } + + clearParallelSlices(); + setRegion(expr.getRegionID()); + setLocation(expr.getLocation()); + setDirections(expr.getDirections()); + allocate(); + expr.evaluate(&data[0]); return *this; } diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 
df513c0290..25765e6f6b 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -21,6 +21,7 @@ class Mesh; class Field3D; class Field2D; +class FieldPerp; template struct is_expr_field2d : std::false_type {}; @@ -327,19 +328,20 @@ struct BinaryExpr { CELL_LOC location = CELL_CENTRE; DirectionTypes directions; std::optional regionID; + std::optional yindex; template BinaryExpr(const typename L::View& lhs, const typename R::View& rhs, Func f, Mesh* mesh, CELL_LOC location, DirectionTypes directions, std::optional regionID, - const Region& region) + const Region& region, std::optional yindex = std::nullopt) : lhs(lhs), rhs(rhs), indices(region.getLinearIndices()), f(f), mesh(mesh), - location(location), directions(directions), regionID(regionID) {} + location(location), directions(directions), regionID(regionID), yindex(yindex) {} BinaryExpr(const typename L::View& lhs, const typename R::View& rhs, Func f, Mesh* mesh, CELL_LOC location, DirectionTypes directions, std::optional regionID, - const Array& indices) + const Array& indices, std::optional yindex = std::nullopt) : lhs(lhs), rhs(rhs), indices(indices), f(f), mesh(mesh), location(location), - directions(directions), regionID(regionID) {} + directions(directions), regionID(regionID), yindex(yindex) {} BinaryExpr& operator=(const BinaryExpr&) = delete; BinaryExpr& operator=(BinaryExpr&&) = delete; @@ -407,6 +409,7 @@ struct BinaryExpr { CELL_LOC getLocation() const { return location; } DirectionTypes getDirections() const { return directions; } std::optional getRegionID() const { return regionID; }; + int getIndex() const { return yindex.value_or(-1); } }; #endif // BOUT_FIELDSOPS_HXX diff --git a/include/bout/fieldperp.hxx b/include/bout/fieldperp.hxx index 9063b23890..b16bc562bb 100644 --- a/include/bout/fieldperp.hxx +++ b/include/bout/fieldperp.hxx @@ -98,7 +98,7 @@ public: typename = std::enable_if_t<(is_expr_fieldperp_v && is_expr_fieldperp_v)>> FieldPerp(const BinaryExpr& expr) : 
FieldPerp(evaluateBinaryExpr(expr), expr.getMesh(), expr.getLocation(), - /* yindex */ -1, expr.getDirections()) {} + expr.getIndex(), expr.getDirections()) {} ~FieldPerp() override = default; @@ -108,6 +108,21 @@ public: FieldPerp& operator=(const FieldPerp& rhs); FieldPerp& operator=(FieldPerp&& rhs) = default; FieldPerp& operator=(BoutReal rhs); + template + std::enable_if_t, FieldPerp&> + operator=(const BinaryExpr& expr) { + if (!isAllocated() || getMesh() != expr.getMesh()) { + *this = FieldPerp{expr}; + return *this; + } + + setLocation(expr.getLocation()); + setDirections(expr.getDirections()); + setIndex(expr.getIndex()); + allocate(); + expr.evaluate(&data[0]); + return *this; + } /// Return a Region reference to use to iterate over this field const Region& getRegion(REGION region) const; diff --git a/include/bout/petsc_interface.hxx b/include/bout/petsc_interface.hxx index 2ce71d0549..830a7f4122 100644 --- a/include/bout/petsc_interface.hxx +++ b/include/bout/petsc_interface.hxx @@ -355,7 +355,7 @@ public: ASSERT2(positions.size() == weights.size()); #if CHECK > 2 for (const auto val : weights) { - ASSERT3(finite(val)); + ASSERT3(std::isfinite(val)); } #endif if (positions.empty()) { @@ -376,25 +376,25 @@ public: if (this == &other) { return *this; } - ASSERT3(finite(static_cast(other))); + ASSERT3(std::isfinite(static_cast(other))); *this = static_cast(other); return *this; } Element& operator=(BoutReal val) { - ASSERT3(finite(val)); + ASSERT3(std::isfinite(val)); value = val; setValues(val, INSERT_VALUES); return *this; } Element& operator+=(BoutReal val) { - ASSERT3(finite(val)); + ASSERT3(std::isfinite(val)); auto columnPosition = std::find(positions.begin(), positions.end(), petscCol); if (columnPosition != positions.end()) { const int index = std::distance(positions.begin(), columnPosition); value += weights[index] * val; - ASSERT3(finite(value)); + ASSERT3(std::isfinite(value)); } setValues(val, ADD_VALUES); return *this; diff --git 
a/src/field/fieldperp.cxx b/src/field/fieldperp.cxx index bf9e6b348d..72976704f2 100644 --- a/src/field/fieldperp.cxx +++ b/src/field/fieldperp.cxx @@ -2,7 +2,7 @@ * Class for 2D X-Z slices * ************************************************************************** - * Copyright 2010 - 2025 BOUT++ developers + * Copyright 2010 - 2026 BOUT++ developers * * Contact: Ben Dudson, dudson2@llnl.gov * diff --git a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx index 4f7a747a7b..f10f903d19 100644 --- a/src/mesh/coordinates.cxx +++ b/src/mesh/coordinates.cxx @@ -1345,8 +1345,8 @@ int Coordinates::jacobian() { const bool extrapolate_x = not localmesh->sourceHasXBoundaryGuards(); const bool extrapolate_y = not localmesh->sourceHasYBoundaryGuards(); - auto g = FieldMetric{g11 * g22 * g33 + 2.0 * g12 * g13 * g23 - g11 * g23 * g23 - - g22 * g13 * g13 - g33 * g12 * g12}; + FieldMetric g = g11 * g22 * g33 + 2.0 * g12 * g13 * g23 - g11 * g23 * g23 + - g22 * g13 * g13 - g33 * g12 * g12; // Check that g is positive bout::checkPositive(g, "The determinant of g^ij", "RGN_NOBNDRY"); diff --git a/tests/unit/field/test_field2d.cxx b/tests/unit/field/test_field2d.cxx index 23eddc15ef..a91acd4a40 100644 --- a/tests/unit/field/test_field2d.cxx +++ b/tests/unit/field/test_field2d.cxx @@ -853,7 +853,7 @@ TEST_F(Field2DTest, InvalidateGuards) { sum = 0; for (const auto& i : field) { - if (!finite(field[i])) { + if (!std::isfinite(field[i])) { sum++; } } @@ -1502,6 +1502,23 @@ TEST_F(Field2DTest, OperatorEqualsField2D) { EXPECT_EQ(field.getDirectionZ(), field2.getDirectionZ()); } +TEST_F(Field2DTest, OperatorEqualsBinaryExprCopiesMetadata) { + Field2D source{ + mesh_staggered, CELL_XLOW, {YDirectionType::Aligned, ZDirectionType::Average}}; + source = 4.; + + Field2D target(mesh_staggered); + target = 0.; + + target = sqrt(source); + + EXPECT_EQ(target.getMesh(), source.getMesh()); + EXPECT_EQ(target.getLocation(), source.getLocation()); + EXPECT_EQ(target.getDirectionY(), 
source.getDirectionY()); + EXPECT_EQ(target.getDirectionZ(), source.getDirectionZ()); + EXPECT_TRUE(IsFieldEqual(target, 2.)); +} + TEST_F(Field2DTest, EmptyFrom) { // Create field with non-default arguments so we can check they get copied // to 'field2'. diff --git a/tests/unit/field/test_field3d.cxx b/tests/unit/field/test_field3d.cxx index 9feae8343b..905b182018 100644 --- a/tests/unit/field/test_field3d.cxx +++ b/tests/unit/field/test_field3d.cxx @@ -1211,7 +1211,7 @@ TEST_F(Field3DTest, InvalidateGuards) { sum = 0; for (const auto& i : field) { - if (!finite(field[i])) { + if (!std::isfinite(field[i])) { sum++; } } @@ -2517,6 +2517,25 @@ TEST_F(Field3DTest, OperatorEqualsField3D) { EXPECT_EQ(field.getDirectionZ(), field2.getDirectionZ()); } +TEST_F(Field3DTest, OperatorEqualsBinaryExprCopiesMetadata) { + Field3D source{ + mesh_staggered, CELL_XLOW, {YDirectionType::Aligned, ZDirectionType::Average}}; + source = 9.; + + Field3D target(mesh_staggered); + target = 0.; + target.splitParallelSlices(); + + target = sqrt(source); + + EXPECT_EQ(target.getMesh(), source.getMesh()); + EXPECT_EQ(target.getLocation(), source.getLocation()); + EXPECT_EQ(target.getDirectionY(), source.getDirectionY()); + EXPECT_EQ(target.getDirectionZ(), source.getDirectionZ()); + EXPECT_FALSE(target.hasParallelSlices()); + EXPECT_TRUE(IsFieldEqual(target, 3.)); +} + TEST_F(Field3DTest, EmptyFrom) { // Create field with non-default arguments so we can check they get copied // to 'field2'. 
diff --git a/tests/unit/field/test_fieldperp.cxx b/tests/unit/field/test_fieldperp.cxx index 060355a7db..46f07d589f 100644 --- a/tests/unit/field/test_fieldperp.cxx +++ b/tests/unit/field/test_fieldperp.cxx @@ -849,7 +849,7 @@ TEST_F(FieldPerpTest, InvalidateGuards) { sum = 0; for (const auto& i : field) { - if (!finite(field[i])) { + if (!std::isfinite(field[i])) { sum++; } } @@ -1784,6 +1784,42 @@ TEST_F(FieldPerpTest, OperatorEqualsFieldPerp) { EXPECT_EQ(field.getDirectionZ(), field2.getDirectionZ()); } +TEST_F(FieldPerpTest, ConstructFromBinaryExprCopiesMetadata) { + FieldPerp source{ + mesh_staggered, CELL_XLOW, 3, {YDirectionType::Aligned, ZDirectionType::Average}}; + source = 4.; + + FieldPerp result{sqrt(source)}; + + EXPECT_EQ(result.getMesh(), source.getMesh()); + EXPECT_EQ(result.getLocation(), source.getLocation()); + EXPECT_EQ(result.getIndex(), source.getIndex()); + EXPECT_EQ(result.getDirectionY(), source.getDirectionY()); + EXPECT_EQ(result.getDirectionZ(), source.getDirectionZ()); + EXPECT_TRUE(IsFieldEqual(result, 2.)); +} + +TEST_F(FieldPerpTest, OperatorEqualsBinaryExprCopiesMetadata) { + FieldPerp source{ + mesh_staggered, CELL_XLOW, 3, {YDirectionType::Aligned, ZDirectionType::Average}}; + source = 4.; + + FieldPerp target{mesh_staggered, + CELL_CENTRE, + 1, + {YDirectionType::Standard, ZDirectionType::Standard}}; + target = 0.; + + target = sqrt(source); + + EXPECT_EQ(target.getMesh(), source.getMesh()); + EXPECT_EQ(target.getLocation(), source.getLocation()); + EXPECT_EQ(target.getIndex(), source.getIndex()); + EXPECT_EQ(target.getDirectionY(), source.getDirectionY()); + EXPECT_EQ(target.getDirectionZ(), source.getDirectionZ()); + EXPECT_TRUE(IsFieldEqual(target, 2.)); +} + TEST_F(FieldPerpTest, EmptyFrom) { // Create field with non-default arguments so we can check they get copied // to 'field2'. 
From b0ff0c6d90675b03483da2f81b3f2c843be7dddc Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Fri, 19 Jun 2026 15:36:39 -0700 Subject: [PATCH 53/58] examples/elm-pb: Tidying and fix for BoutExpr Argument to `where` needs to be converted to a Field from a BinaryExpr. --- examples/elm-pb/elm_pb.cxx | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/examples/elm-pb/elm_pb.cxx b/examples/elm-pb/elm_pb.cxx index 9eb7396987..62cc970869 100644 --- a/examples/elm-pb/elm_pb.cxx +++ b/examples/elm-pb/elm_pb.cxx @@ -6,9 +6,14 @@ *******************************************************************************/ #include +#include +#include #include +#include #include #include +#include +#include #include #include #include @@ -16,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -246,8 +252,8 @@ class ELMpb : public PhysicsModel { std::unique_ptr phiSolver{nullptr}; std::unique_ptr aparSolver{nullptr}; - const Field2D N0tanh(BoutReal n0_height, BoutReal n0_ave, BoutReal n0_width, - BoutReal n0_center, BoutReal n0_bottom_x) { + Field2D N0tanh(BoutReal n0_height, BoutReal n0_ave, BoutReal n0_width, + BoutReal n0_center, BoutReal n0_bottom_x) { Field2D result; result.allocate(); @@ -1138,7 +1144,7 @@ class ELMpb : public PhysicsModel { // Only if not restarting: Check initial perturbation // Set U to zero where P0 < vacuum_pressure - U = where(P0 - vacuum_pressure, U, 0.0); + U = where(Field2D{P0 - vacuum_pressure}, U, 0.0); if (constn0) { ubyn = U; @@ -1202,7 +1208,7 @@ class ELMpb : public PhysicsModel { // Perform communications mesh->communicate(comms); - Coordinates* metric = mesh->getCoordinates(); + const Coordinates* metric = mesh->getCoordinates(); //////////////////////////////////////////// // Transitions from 0 in core to 1 in vacuum @@ -1698,10 +1704,10 @@ class ELMpb : public PhysicsModel { // Vacuum solution if (relax_j_vac) { // Calculate the J and Psi profile we're aiming for - Field3D Jtarget = Jpar * 
(1.0 - vac_mask); // Zero in vacuum + const Field3D Jtarget = Jpar * (1.0 - vac_mask); // Zero in vacuum // Invert laplacian for Psi - Field3D Psitarget = aparSolver->solve(Jtarget); + const Field3D Psitarget = aparSolver->solve(Jtarget); // Add a relaxation term in the vacuum ddt(Psi) = @@ -1832,7 +1838,7 @@ class ELMpb : public PhysicsModel { ddt(U) -= 0.5 * Upara2 * bracket(Pi0, Dperp2Phi, bm_exb) / B0; Field3D B0phi = B0 * phi; mesh->communicate(B0phi); - Field3D B0phi0 = B0 * phi0; + Field2D B0phi0 = B0 * phi0; mesh->communicate(B0phi0); ddt(U) += 0.5 * Upara2 * bracket(B0phi, Dperp2Pi0, bm_exb) / B0; ddt(U) += 0.5 * Upara2 * bracket(B0phi0, Dperp2Pi, bm_exb) / B0; From 404e063674e8e95ffb71e08d6b336176a4e7f776 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Fri, 19 Jun 2026 15:41:07 -0700 Subject: [PATCH 54/58] Updating documentation Document field expressions and GPU support changes. --- manual/sphinx/developer_docs/data_types.rst | 129 +++++++++----- manual/sphinx/index.rst | 29 ++-- .../sphinx/user_docs/algebraic_operators.rst | 74 ++++++--- manual/sphinx/user_docs/field_expressions.rst | 151 +++++++++++++++++ manual/sphinx/user_docs/gpu_support.rst | 157 ++++++++++++------ .../sphinx/user_docs/parallel-transforms.rst | 15 ++ 6 files changed, 432 insertions(+), 123 deletions(-) create mode 100644 manual/sphinx/user_docs/field_expressions.rst diff --git a/manual/sphinx/developer_docs/data_types.rst b/manual/sphinx/developer_docs/data_types.rst index f9411dcb39..7feb3945aa 100644 --- a/manual/sphinx/developer_docs/data_types.rst +++ b/manual/sphinx/developer_docs/data_types.rst @@ -280,7 +280,7 @@ The region to iterate over can be over ``Field2D``, ``Field3D``, or - `RGN_NOY`, which skips the y boundaries and guard cells New regions can be created and modified, see section below. 
- + A standard C++ range for loop can also be used, but this is unlikely to OpenMP parallelise or vectorise:: @@ -306,7 +306,7 @@ For loops inside parallel regions, there is ``BOUT_FOR_INNER``:: } ... } - + If a more general OpenMP directive is needed, there is ``BOUT_FOR_OMP``:: @@ -314,7 +314,7 @@ If a more general OpenMP directive is needed, there is BOUT_FOR_OMP(i, region, parallel for reduction(max:result)) { result = f[i] > result ? f[i] : result; } - + The iterator provides access to the x, y, z indices:: Field3D f(0.0); @@ -385,14 +385,14 @@ good performance on typical x86_64 hardware. Some simple diagnostics are printed at the start of the BOUT++ output which may help. For example the ``blob2d`` example prints:: - Registered region 3D RGN_ALL: - Total blocks : 1040, min(count)/max(count) : 64 (1040)/ 64 (1040), Max imbalance : 1, Small block count : 0 + Registered region 3D RGN_ALL: + Total blocks : 1040, min(count)/max(count) : 64 (1040)/ 64 (1040), Max imbalance : 1, Small block count : 0 In this case all blocks are the same size, so the ``Max imbalance`` (ratio of maximum to minimum block size) is 1. The ``Small block count`` is currently defined as the number of blocks with a size less than half the maximum block size. Ideally all blocks should be a -similar size, so that work is evenly balanced between threads. +similar size, so that work is evenly balanced between threads. Creating new regions ~~~~~~~~~~~~~~~~~~~~ @@ -422,7 +422,7 @@ in the mask (i.e. set subtraction):: or:: auto region = mask(mesh->getRegion2D("RGN_ALL"), mesh->getRegion2D("RGN_GUARDS")); - + The above example would produce a region containing all the indices in ``RGN_ALL`` which are not in ``RGN_GUARDS``. @@ -444,7 +444,7 @@ In the current implementation overwriting a region, by attempting to add a region which already exists, is not allowed, and will result in a ``BoutException`` being thrown. This restriction may be removed in future. - + .. 
_sec-rangeiterator: Iterating over ranges @@ -493,33 +493,73 @@ initialised in the constructor. .. _sec-fieldops: -Field2D/Field3D Arithmetic Operators ------------------------------------- - -The arithmetic operators (``+``, ``-``, ``/``, ``*``) for `Field2D` -and `Field3D` are generated automatically using the `Jinja`_ -templating system. This requires Python 3 (2.7 may work, but only 3 is -supported). - -Because this is fairly low-level code, and we don't expect it to -change very much, the generated code is kept in the git -repository. This has the benefit that Python and Jinja are not needed -to build BOUT++, only to change the ``Field`` operator code. - -.. warning:: You should not modify the generated code - directly. Instead, modify the template and re-generate - the code. If you commit changes to the template and/or - driver, make sure to re-generate the code and commit it - as well - -The Jinja template is in ``src/field/gen_fieldops.jinja``, and the -driver is ``src/field/gen_fieldops.py``. The driver loops over every -combination of `BoutReal`, `Field2D`, `Field3D` (collectively just -"fields" here) with the arithmetic operators, and uses the template to -generate the appropriate code. There is some logic in the template to -handle certain combinations of the input fields: for example, for the -binary infix operators, only check the two arguments are on identical -meshes if neither is `BoutReal`. +Field expressions and generated operators +----------------------------------------- + +At user level, field algebra now looks more uniform than it used to: +ordinary arithmetic and many unary algebraic operators can be combined +into lazy expressions and only materialized when a concrete field or +scalar result is needed. + +This implementation is split into two layers. + +``BinaryExpr`` and views +~~~~~~~~~~~~~~~~~~~~~~~~ + +The lazy-expression layer lives in ``include/bout/fieldops.hxx``. 
The +central type is ``BinaryExpr``, which stores: + +- views of the left and right expression operands +- the operation functor +- mesh and metadata needed to check compatibility and materialize the + result +- a cached list of linear region indices describing where the + expression is valid + +`Field2D`, `Field3D`, and `FieldPerp` act as expression leaves by +providing lightweight ``View`` types. Those views are the device- and +backend-friendly objects used by the expression evaluator. + +Materialization happens when a field is constructed or assigned from an +expression, when an expression is stored in `Options`, or when a scalar +reduction such as ``min`` or ``mean`` is requested. The same mechanism +is also used to propagate metadata such as mesh, staggered location, +directions, and `FieldPerp` y-index. + +The unary algebraic helpers in ``include/bout/field.hxx`` build on the +same mechanism. Functions such as ``sqrt``, ``abs``, ``SQ``, +``if_else``, ``if_else_zero``, ``min``, ``max``, and ``mean`` can all +operate directly on lazy expressions. + +Generated eager operators +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The eager arithmetic operators and in-place update paths are still +generated automatically using the `Jinja`_ templating system. The main +files are: + +- ``src/field/gen_fieldops.jinja`` +- ``src/field/gen_fieldops.py`` +- ``src/field/generated_fieldops.cxx`` + +The generated code handles the broad matrix of combinations between +`BoutReal`, `Field2D`, `Field3D`, `Field3DParallel`, and `FieldPerp`, +including several mixed-rank and in-place cases where hand-maintaining +all overloads would be error-prone. + +The generated loops now also depend on the configured execution backend. +At configure time, the generator is told whether to emit RAJA-based, +OpenMP-based, or serial loop bodies for the eager paths. + +Because this is low-level code, the generated source is kept in the git +repository. 
Python and Jinja are therefore only needed when changing the +operator generator itself, not for an ordinary build. + +.. warning:: + + Do not edit ``generated_fieldops.cxx`` directly. Instead, modify the + template or generator, then regenerate the file and commit both the + source change and the regenerated output. To install Jinja: @@ -527,16 +567,17 @@ To install Jinja: $ pip3 install --user Jinja2 -To re-generate the code, there is a ``make`` target for -``gen_fieldops.cxx`` in ``src/field/makefile``. This also tries to -apply ``clang-format`` in order to keep to a consistent code style. +To regenerate the code, use the target for ``generated_fieldops.cxx`` in +``src/field/makefile`` or the corresponding CMake-driven generation +path. This also applies ``clang-format`` to keep the output consistent. -.. note:: ``clang-format`` is bundled with ``clang``. This should be - available through your system package manager. If you do not - have sufficient privileges on your system, you can install - it from the source `clang`_. One of the BOUT++ maintainers - can help apply it for you too. +.. note:: + + ``clang-format`` is bundled with ``clang``. This should be available + through your system package manager. If you do not have sufficient + privileges on your system, you can install it from the source + `clang`_. One of the BOUT++ maintainers can also help apply it for + you. .. _Jinja: http://jinja.pocoo.org/ .. _clang: https://clang.llvm.org/ - diff --git a/manual/sphinx/index.rst b/manual/sphinx/index.rst index 9408d05057..6a5a48ba34 100644 --- a/manual/sphinx/index.rst +++ b/manual/sphinx/index.rst @@ -15,10 +15,12 @@ The documentation is divided into the following sections: * :ref:`model-outputs` * :ref:`bout-interfaces` - + + * :ref:`performance-and-accelerators` + * :ref:`developer-docs` - + ..
toctree:: :maxdepth: 2 :caption: Getting started @@ -30,25 +32,24 @@ The documentation is divided into the following sections: user_docs/advanced_install user_docs/running_bout user_docs/new_in_v5 - + .. toctree:: :maxdepth: 2 :caption: BOUT++ models :name: bout-models - + user_docs/physics_models user_docs/makefiles user_docs/variable_init user_docs/boundary_options user_docs/testing - user_docs/gpu_support user_docs/adios2 - + .. toctree:: :maxdepth: 2 :caption: Model inputs :name: model-inputs - + user_docs/bout_options user_docs/input_grids @@ -56,19 +57,20 @@ The documentation is divided into the following sections: :maxdepth: 2 :caption: Model outputs :name: model-outputs - + user_docs/output_and_post user_docs/python_boutpp - + .. toctree:: :maxdepth: 2 :caption: BOUT++ interfaces :name: bout-interfaces - + user_docs/time_integration user_docs/parallel-transforms user_docs/laplacian user_docs/differential_operators + user_docs/field_expressions user_docs/algebraic_operators user_docs/staggered_grids user_docs/eigenvalue_solver @@ -76,6 +78,13 @@ The documentation is divided into the following sections: user_docs/invertable_operator user_docs/petsc +.. toctree:: + :maxdepth: 2 + :caption: Performance and accelerators + :name: performance-and-accelerators + + user_docs/gpu_support + .. toctree:: :maxdepth: 1 :caption: Field-aligned coordinate systems diff --git a/manual/sphinx/user_docs/algebraic_operators.rst b/manual/sphinx/user_docs/algebraic_operators.rst index b2089f9ec3..b8c40d4dc5 100644 --- a/manual/sphinx/user_docs/algebraic_operators.rst +++ b/manual/sphinx/user_docs/algebraic_operators.rst @@ -1,28 +1,39 @@ .. _sec-algebraic-ops: Algebraic operators -========================= +=================== BOUT++ provides a wide variety of algebraic operators acting on fields. -The algebraic operators are listed in :numref:`tab-algebraic-ops`. 
-For a completely up-to-date list, see the ``Non-member functions`` -part of :doc:`field2d.hxx<../_breathe_autogen/file/field2d_8hxx>`, -:doc:`field3d.hxx<../_breathe_autogen/file/field3d_8hxx>`, +Most of these operators can participate in the lazy field-expression +system described in :ref:`sec-field-expressions`. In practice this means +you can usually write ordinary algebraic code and let BOUT++ delay +evaluation until assignment or reduction. + +For a completely up-to-date list, see the ``Non-member functions`` part +of :doc:`field2d.hxx<../_breathe_autogen/file/field2d_8hxx>`, +:doc:`field3d.hxx<../_breathe_autogen/file/field3d_8hxx>`, and :doc:`fieldperp.hxx<../_breathe_autogen/file/fieldperp_8hxx>`. +Common operators +---------------- + .. _tab-algebraic-ops: .. table:: Algebraic operators - +------------------------------------------+------------------------------------------------------+ - | Name | Description | + +------------------------------------------+------------------------------------------------------+ + | Name | Description | +==========================================+======================================================+ - | ``min(f, allpe=true, region)`` | Minimum (optionally over all processes) | + | ``min(f, allpe=true, region)`` | Minimum (optionally over all processes) | +------------------------------------------+------------------------------------------------------+ | ``max(f, allpe=true, region)`` | Maximum (optionally over all processes) | +------------------------------------------+------------------------------------------------------+ + | ``mean(f, allpe=true, region)`` | Mean (optionally over all processes) | + +------------------------------------------+------------------------------------------------------+ | ``pow(lhs, rhs, region)`` | :math:`\mathtt{lhs}^\mathtt{rhs}` | +------------------------------------------+------------------------------------------------------+ + | ``SQ(f, region)`` | Square of ``f`` | + 
+------------------------------------------+------------------------------------------------------+ | ``sqrt(f, region)`` | :math:`\sqrt{(f)}` | +------------------------------------------+------------------------------------------------------+ | ``abs(f, region)`` | :math:`|f|` | @@ -65,24 +76,43 @@ part of :doc:`field2d.hxx<../_breathe_autogen/file/field2d_8hxx>`, | | of `f` as opposed to the AC, alternating current, or | | | fluctuating part) | +------------------------------------------+------------------------------------------------------+ + | ``if_else(cond, lhs, rhs)`` | Select between two algebraic branches | + +------------------------------------------+------------------------------------------------------+ + | ``if_else_zero(cond, expr)`` | Select either ``expr`` or zero | + +------------------------------------------+------------------------------------------------------+ + +These operators can usually be combined directly in expressions:: + + Field3D rhs = sqrt(SQ(n) + SQ(T)); + Field3D masked = if_else(use_drive, source * profile, sink * profile); + BoutReal max_error = max(abs(lhs - rhs), true); + +Reductions such as ``min``, ``max``, and ``mean`` can operate directly +on an expression, so an intermediate field is often unnecessary. + +Region arguments +---------------- -These operators take a ``region`` argument, whose values can be [#]_ (see -:ref:`sec-iterating`) +These operators take a ``region`` argument. Common values are [#]_ (see +:ref:`sec-iterating`): -- `RGN_ALL`, which is the whole mesh; +- ``RGN_ALL``, which is the whole mesh +- ``RGN_NOBNDRY``, which skips all boundaries +- ``RGN_NOX``, which skips the x boundaries +- ``RGN_NOY``, which skips the y boundaries -- `RGN_NOBNDRY`, which skips all boundaries; +The default is usually ``RGN_ALL``. Restricting the region can improve +performance when guard-cell values will not be used. 
-- `RGN_NOX`, which skips the x boundaries +When a region-limited expression is materialized into a field, only the +selected region is guaranteed to contain valid values. This is the same +performance-oriented convention used by other field operators. -- `RGN_NOY`, which skips the y boundaries +Further reading +--------------- -The default value for the region argument is `RGN_ALL` which should work in all -cases. However, the region argument can be used for optimization, to skip -calculations in guard cells if it is known that those results will not be -needed (for example, if no derivatives of the result will be calculated). Since -these operators can be relatively expensive compared to addition, subtraction, -multiplication this can be a useful performance improvement. +- :ref:`sec-field-expressions` +- :ref:`sec-gpusupport` -.. [#] More regions may be added in future, for example to act on only subsets of the - physical domain. +.. [#] More regions may be added in future, for example to act on only + subsets of the physical domain. diff --git a/manual/sphinx/user_docs/field_expressions.rst b/manual/sphinx/user_docs/field_expressions.rst new file mode 100644 index 0000000000..62a50988d6 --- /dev/null +++ b/manual/sphinx/user_docs/field_expressions.rst @@ -0,0 +1,151 @@ +.. _sec-field-expressions: + +Field Expressions +================= + +BOUT++ field algebra now supports *lazy expressions* for many common +operations. Instead of creating a temporary field for every ``+``, +``-``, ``*``, ``/``, ``sqrt`` or ``abs``, BOUT++ can keep the expression +symbolic and evaluate it only when a concrete field or scalar result is +needed. + +This keeps ordinary model code readable while reducing temporary +allocations and extra loops over the mesh. It is especially helpful for +accelerator backends, where launching fewer kernels matters. 
+ +What stays lazy +--------------- + +The following operations can form lazy expressions over `Field2D`, +`Field3D`, `Field3DParallel`, and `FieldPerp` where the combination makes +sense: + +- Arithmetic operators: ``+``, ``-``, ``*``, ``/`` +- Unary algebraic operators such as ``sqrt``, ``abs``, ``exp``, ``log``, + ``sin``, ``cos``, ``tan``, ``sinh``, ``cosh``, ``tanh``, ``floor``, + and ``SQ`` +- Simple conditionals with ``if_else`` and ``if_else_zero`` +- Reductions such as ``min``, ``max``, and ``mean`` + +For example:: + + Field3D n, T; + Field3D result; + + result = sqrt(SQ(n) + SQ(T)); + +The right-hand side can stay lazy until the assignment to ``result``. + +When evaluation happens +----------------------- + +An expression is evaluated when BOUT++ needs actual storage or a scalar +answer. Common triggers are: + +- assigning to a field +- constructing a field from an expression +- assigning a field expression into an `Options` object +- calling scalar reductions such as ``min``, ``max``, or ``mean`` + +Examples:: + + Field3D result = n + T; + options["rhs"] = n + T; + BoutReal max_value = max(abs(n + T), true); + +Region-limited expressions +-------------------------- + +Many algebraic operators take a ``region`` argument, usually defaulting +to ``RGN_ALL``. A lazy expression keeps track of that region. + +Only values inside the requested region are guaranteed to be valid after +materialization. This is useful for skipping guard-cell work when the +result will only be used in a smaller region:: + + Field3D interior = abs(n, "RGN_NOBNDRY"); + +As with other region-limited field operations in BOUT++, code that later +uses guard cells should communicate or otherwise fill those cells before +relying on them. 
+ +Metadata propagation +-------------------- + +When an expression is materialized into a field, BOUT++ propagates the +field metadata carried by the expression: + +- mesh pointer +- cell location +- field directions +- for `FieldPerp`, the y-index + +This means expressions are intended to behave like ordinary field +operations in user code. Compatibility checks still apply: combining +fields on different meshes or incompatible staggered locations is an +error. + +Mixed field types +----------------- + +Several mixed-type combinations are supported directly: + +- `Field2D` with `Field3D`: the 2D quantity is broadcast in ``z`` +- `FieldPerp` with matching perpendicular data: the operation uses the + `FieldPerp` y-index +- expressions involving metric components may return + `Coordinates::FieldMetric`, which is `Field2D` or `Field3D` depending + on how BOUT++ was built + +In practice, this means code such as:: + + Coordinates::FieldMetric grad = coords->J / coords->g_22; + Field3D rhs = density * temperature + background_2d; + +can use the same algebraic style even when metric dimensionality or +field rank differs. + +Conditionals +------------ + +``if_else`` selects between two algebraic branches without forcing the +branches to be precomputed:: + + Field3D rhs = if_else(use_source, source * density, sink * density); + +``if_else_zero(condition, expr)`` is a shorthand for selecting either an +expression or zero:: + + Field3D rhs = if_else_zero(include_drive, drive * profile); + +This is particularly convenient when optional source terms are enabled +or disabled by compile-time or run-time logic. + +Reductions on expressions +------------------------- + +Reductions can operate directly on expressions instead of requiring an +intermediate field:: + + BoutReal rms = sqrt(mean(SQ(n - n0), true, "RGN_NOBNDRY")); + BoutReal max_error = max(abs(lhs - rhs), true); + +This is often clearer than explicitly constructing a temporary field, +and it avoids extra storage. 
+ +Relation to GPU execution +------------------------- + +Lazy field expressions are the high-level path to reducing temporary +work. They are a good default when ordinary field algebra expresses the +operation clearly. + +For more control, especially when you want to fuse derivative operators +into a single explicit loop, see :ref:`sec-gpusupport`. + +See also +-------- + +- :doc:`algebraic_operators` +- :doc:`gpu_support` +- :doc:`differential_operators` diff --git a/manual/sphinx/user_docs/gpu_support.rst b/manual/sphinx/user_docs/gpu_support.rst index cc0cba8def..670c1a9b32 100644 --- a/manual/sphinx/user_docs/gpu_support.rst +++ b/manual/sphinx/user_docs/gpu_support.rst @@ -3,68 +3,92 @@ GPU support =========== -This section describes work in progress to develop GPU support in -BOUT++ models. It includes both configuration and compilation on GPU -systems, but also ways to write physics models which are designed to -give higher performance. These methods may also be beneficial for CPU -architectures, but have fewer safety checks, less functionality and -run-time flexibility than the field operators. +This section describes the main ways to run BOUT++ work efficiently on +GPUs or other accelerator-style backends. -To use the single index operators and the ``BOUT_FOR_RAJA`` loop macro:: +There are now two complementary levels of optimization: + +1. Write ordinary field algebra and let BOUT++ keep many algebraic + expressions lazy until assignment or reduction. +2. Drop down to explicit `RAJA` loops and single-index operators when + you want complete control over loop fusion and kernel structure. + +The first approach is usually the best starting point. The second is for +hot loops where you want to manually combine derivative operators, +accessors, and run-time captures in one kernel. + +Automatic fusion with field expressions +--------------------------------------- + +Many algebraic operations on fields can now be represented as lazy +expressions. 
This keeps user code close to the familiar field-based +style while reducing temporary fields and extra passes over memory. + +Typical examples are: + +.. code-block:: cpp + + Field3D rhs = sqrt(SQ(n) + SQ(T)); + ddt(n) = source * profile - sink * n; + BoutReal max_error = max(abs(lhs - rhs), true); + +This is the highest-level route to better execution behavior, and it is +usually the most maintainable. See :ref:`sec-field-expressions` for the +details of what stays lazy and when evaluation happens. + +Lazy expressions mainly help with *algebraic* fusion. If your hot path +is dominated by differential operators and you need to fuse those +operators into a single explicit loop, use the lower-level approach +described below. + +Manual fusion with RAJA loops +----------------------------- + +To use the single-index operators and the ``BOUT_FOR_RAJA`` loop macro:: #include "bout/single_index_ops.hxx" #include "bout/rajalib.hxx" -To run parts of a physics model RHS function on a GPU, the basic -outline of the code is to (optionally) first copy any class member -variables which will be used in the loop into local variables -(see below for an alternative method):: +To run part of a physics-model RHS on a GPU, start by copying any class +member variables needed inside the loop into local variables, or capture +them explicitly:: - auto _setting = setting; // Create a local variable to capture + auto _setting = setting; -Then create a `FieldAccessor` to efficiently access field and -coordinate system data inside the loop:: +Then create `FieldAccessor` objects to read and write field data inside +the loop:: auto n_acc = FieldAccessor<>(n); auto phi_acc = FieldAccessor<>(phi); -There are also ``Field2DAccessor``s for accessing ``Field2D`` -types. If fields are staggered, then the expected location should be -passed as a template parameter:: +There are also ``Field2DAccessor`` objects for `Field2D`. 
If fields are +staggered, the expected location can be supplied as a template +parameter:: auto Jpar_acc = FieldAccessor<CELL_YLOW>(Jpar); -which enables the cell location to be checked in the operators at -compile time rather than run time. +Finally the loop itself can be written as:: -Finally the loop itself can be written something like:: + Field3D result; + auto result_acc = FieldAccessor<>(result); BOUT_FOR_RAJA(i, region) { - ddt(n_acc)[i] = -bracket(phi_acc, n_acc, i) - 2 * DDZ(n_acc, i); - /* ... */ + result_acc[i] = -bracket(phi_acc, n_acc, i) - 2.0 * DDZ(n_acc, i); }; Note the semicolon after the closing brace, which is needed because -this is the body of a lambda function. Inside the body of the loop, -the operators like ``bracket`` and ``DDZ`` calculate the derivatives -at a single index ``i``. These are "single index operators` and are -defined in ``bout/single_index_ops.hxx``. - -Any class member variables which are used inside the loop must be captured -as a local variable. If this is not done, then the code will probably compile, -but may produce an illegal memory access error at runtime on the GPU. To -capture the class member, you can copy any class member variables which -will be used in the loop into local variables:: +this is the body of a lambda function. Inside the loop, operators such +as ``bracket`` and ``DDZ`` act at a single index ``i``. These are the +single-index operators defined in ``bout/single_index_ops.hxx``. - auto _setting = setting; // Create a local variable to capture - -and then use ``_setting`` rather than ``setting`` inside the loop. -Alternatively, add variables to be captured to a CAPTURE argument to -the ``BOUT_FOR_RAJA`` loop:: +Any class member variables used inside the loop must be captured +carefully. Otherwise the code may compile but fail at run time on the +GPU.
Instead of using ``this`` implicitly, either shadow members with +local variables or add them to the capture list:: BOUT_FOR_RAJA(i, region, CAPTURE(setting)) { ddt(n_acc)[i] = -bracket(phi_acc, n_acc, i) - 2 * DDZ(n_acc, i); - /* ... code which uses `setting` ... */ + /* ... code that uses `setting` ... */ }; If RAJA is not available, the ``BOUT_FOR_RAJA`` macro will revert to @@ -75,10 +99,26 @@ Note: An important difference between ``BOUT_FOR`` and ``BOUT_FOR_RAJA`` (apart from the closing semicolon) is that the type of the index ``i`` is different inside the loop: ``BOUT_FOR`` uses ``SpecificInd`` types (typically ``Ind3D``), but ``BOUT_FOR_RAJA`` -uses ``int``. ``SpecificInd`` can be explicitly cast to ``int`` so +uses ``int``. ``SpecificInd`` can be explicitly cast to ``int`` so use ``static_cast<int>(i)`` to ensure that it's an integer both with and without RAJA. This might (hopefully) change in future versions. +Choosing between the two approaches +----------------------------------- + +Use lazy field expressions when: + +- the code is mostly algebraic combinations of existing fields +- readability matters more than extracting the last bit of performance +- you want a clear default path that still maps well to accelerator + backends + +Use explicit RAJA loops and single-index operators when: + +- a hot loop is dominated by derivatives +- you want to combine many operations into one kernel manually +- you need direct control over captures, data access, or loop structure + Examples -------- @@ -115,8 +155,12 @@ Notes: CMake configuration ------------------- -To compile BOUT++ components into GPU kernels a few different pieces need to be configured to work together: -RAJA, Umpire, and a CUDA compiler. +To compile BOUT++ components into GPU kernels, a few different pieces +need to work together: RAJA, Umpire, and a CUDA-capable compiler. + +The generated eager field-operator code also selects a loop backend at +configure time.
If RAJA is enabled it uses RAJA loops, otherwise it +falls back to OpenMP or serial loops depending on the build. .. _tab-gpusupport-cmake: @@ -136,6 +180,25 @@ RAJA, Umpire, and a CUDA compiler. | BOUT_ENABLE_WARNINGS | nvcc has incompatible warning flags | On (turn Off for CUDA) | +----------------------+-----------------------------------------+------------------------+ +Shifted metric on GPUs +---------------------- + +When BOUT++ is built with CUDA, the shifted-metric parallel transform +has a CUDA implementation of its toroidal ``shiftZ`` work used while +calculating parallel slices during communication. + +This is most relevant when using: + +.. code-block:: cfg + + [mesh:paralleltransform] + type = shifted + calcParallelSlices_on_communicate = true + +The current implementation is specialized for supported power-of-two +``LocalNz`` values. If parallel slices are disabled on communicate, as in +the aligned-transform workflow, this precomputed-slice path is not used. + Single index operators ---------------------- @@ -263,7 +326,7 @@ likely that the results might be architecture dependent. To minimise the number of times this data needs to be copied from individual fields into the single array, and then copied from CPU to -GPU, ``CoordinatesAccessor``s are cached. A map (``coords_store`` +GPU, ``CoordinatesAccessor``\ s are cached. A map (``coords_store`` defined in ``coordinates_accessor.cxx``) associates ``Array`` objects (containing the array of data) to ``Coordinates`` pointers. If a ``CoordinatesAccessor`` is constructed @@ -314,10 +377,10 @@ This is a `good talk by John Lakos [ACCU 2017] on memory allocators Future work ----------- -Indices -~~~~~~~ - -Setting up a RAJA loop to run on a GPU is still cumbersome and inefficient +The GPU path is still evolving. 
The main long-term direction is to let +more of ordinary field code map efficiently onto accelerator backends, +so that manual kernel construction is only needed for the most +performance-critical cases. Setting up a RAJA loop to run on a GPU is still cumbersome and inefficient due to the need to transform CPU data structures into a form which can be passed to and used on the GPU. In the ``bout/rajalib.hxx`` header there is code like:: @@ -332,7 +395,7 @@ is code like:: auto _ob_i_ind_raw = &_ob_i_ind[0]; which is creating a raw pointer (``_ob_i_ind_raw``) to an array of -``int``s which are allocated using Umpire. The original ``indices`` +``int``\ s which are allocated using Umpire. The original ``indices`` are allocated using ``new`` and are inside a C++ ``std::vector``. The RAJA loop then uses this array like this:: diff --git a/manual/sphinx/user_docs/parallel-transforms.rst b/manual/sphinx/user_docs/parallel-transforms.rst index 3ee3eccfb8..9d9b94af9e 100644 --- a/manual/sphinx/user_docs/parallel-transforms.rst +++ b/manual/sphinx/user_docs/parallel-transforms.rst @@ -120,6 +120,21 @@ Note that here :math:`\theta_0` does not need to be constant in X (radius), since it is only the relative shifts between Y locations which matters. +When BOUT++ is built with CUDA, the shifted-metric implementation also +has a GPU path for the ``shiftZ`` work used to calculate parallel +slices during communication. This is most useful in the standard +shifted-metric workflow with + +.. code-block:: cfg + + [mesh:paralleltransform] + type = shifted + calcParallelSlices_on_communicate = true + +If ``calcParallelSlices_on_communicate = false`` is used, BOUT++ is in +the aligned-transform mode described below, so those precomputed +parallel slices are not generated on communicate. + Special handling is needed for parallel boundary conditions, see :ref:`sec-parallel-bc-shifted-metric`.
From c88908b2bfad108656b3d139435b705219adface Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Fri, 19 Jun 2026 15:55:45 -0700 Subject: [PATCH 55/58] Remove unneeded BinaryExpr evaluations max(abs(BinaryExpr)) will now perform the reduction without allocating and filling a field. --- src/field/fieldperp.cxx | 2 +- src/mesh/coordinates.cxx | 28 ++++++++++++++-------------- src/mesh/coordinates_accessor.cxx | 4 ++-- tests/unit/test_extras.hxx | 1 + 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/src/field/fieldperp.cxx b/src/field/fieldperp.cxx index 72976704f2..5176a79c64 100644 --- a/src/field/fieldperp.cxx +++ b/src/field/fieldperp.cxx @@ -209,7 +209,7 @@ bool operator==(const FieldPerp& a, const FieldPerp& b) { if (!a.isAllocated() || !b.isAllocated()) { return false; } - return (a.getIndex() == b.getIndex()) and (min(FieldPerp{abs(a - b)}) < 1e-10); + return (a.getIndex() == b.getIndex()) and (min(abs(a - b)) < 1e-10); } std::ostream& operator<<(std::ostream& out, const FieldPerp& value) { diff --git a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx index f10f903d19..057ffa65b5 100644 --- a/src/mesh/coordinates.cxx +++ b/src/mesh/coordinates.cxx @@ -1269,15 +1269,15 @@ int Coordinates::calcCovariant(const std::string& region) { } BoutReal maxerr; - maxerr = BOUTMAX(max(abs(FieldMetric{(g_11 * g11 + g_12 * g12 + g_13 * g13) - 1})), - max(abs(FieldMetric{(g_12 * g12 + g_22 * g22 + g_23 * g23) - 1})), - max(abs(FieldMetric{(g_13 * g13 + g_23 * g23 + g_33 * g33) - 1}))); + maxerr = BOUTMAX(max(abs((g_11 * g11 + g_12 * g12 + g_13 * g13) - 1)), + max(abs((g_12 * g12 + g_22 * g22 + g_23 * g23) - 1)), + max(abs((g_13 * g13 + g_23 * g23 + g_33 * g33) - 1))); output_info.write("\tLocal maximum error in diagonal inversion is {:e}\n", maxerr); - maxerr = BOUTMAX(max(abs(FieldMetric{g_11 * g12 + g_12 * g22 + g_13 * g23})), - max(abs(FieldMetric{g_11 * g13 + g_12 * g23 + g_13 * g33})), - max(abs(FieldMetric{g_12 * g13 + g_22 * g23 + g_23 * g33}))); + maxerr 
= BOUTMAX(max(abs(g_11 * g12 + g_12 * g22 + g_13 * g23)), + max(abs(g_11 * g13 + g_12 * g23 + g_13 * g33)), + max(abs(g_12 * g13 + g_22 * g23 + g_23 * g33))); output_info.write("\tLocal maximum error in off-diagonal inversion is {:e}\n", maxerr); @@ -1324,15 +1324,15 @@ int Coordinates::calcContravariant(const std::string& region) { } BoutReal maxerr; - maxerr = BOUTMAX(max(abs(FieldMetric{(g_11 * g11 + g_12 * g12 + g_13 * g13) - 1})), - max(abs(FieldMetric{(g_12 * g12 + g_22 * g22 + g_23 * g23) - 1})), - max(abs(FieldMetric{(g_13 * g13 + g_23 * g23 + g_33 * g33) - 1}))); + maxerr = BOUTMAX(max(abs((g_11 * g11 + g_12 * g12 + g_13 * g13) - 1)), + max(abs((g_12 * g12 + g_22 * g22 + g_23 * g23) - 1)), + max(abs((g_13 * g13 + g_23 * g23 + g_33 * g33) - 1))); output_info.write("\tMaximum error in diagonal inversion is {:e}\n", maxerr); - maxerr = BOUTMAX(max(abs(FieldMetric{g_11 * g12 + g_12 * g22 + g_13 * g23})), - max(abs(FieldMetric{g_11 * g13 + g_12 * g23 + g_13 * g33})), - max(abs(FieldMetric{g_12 * g13 + g_22 * g23 + g_23 * g33}))); + maxerr = BOUTMAX(max(abs(g_11 * g12 + g_12 * g22 + g_13 * g23)), + max(abs(g_11 * g13 + g_12 * g23 + g_13 * g33)), + max(abs(g_12 * g13 + g_22 * g23 + g_23 * g33))); output_info.write("\tMaximum error in off-diagonal inversion is {:e}\n", maxerr); return 0; @@ -1345,8 +1345,8 @@ int Coordinates::jacobian() { const bool extrapolate_x = not localmesh->sourceHasXBoundaryGuards(); const bool extrapolate_y = not localmesh->sourceHasYBoundaryGuards(); - FieldMetric g = g11 * g22 * g33 + 2.0 * g12 * g13 * g23 - g11 * g23 * g23 - - g22 * g13 * g13 - g33 * g12 * g12; + const FieldMetric g = g11 * g22 * g33 + 2.0 * g12 * g13 * g23 - g11 * g23 * g23 + - g22 * g13 * g13 - g33 * g12 * g12; // Check that g is positive bout::checkPositive(g, "The determinant of g^ij", "RGN_NOBNDRY"); diff --git a/src/mesh/coordinates_accessor.cxx b/src/mesh/coordinates_accessor.cxx index d28b0db1c4..efc27e9715 100644 --- a/src/mesh/coordinates_accessor.cxx +++ 
b/src/mesh/coordinates_accessor.cxx @@ -1,6 +1,6 @@ #include "bout/coordinates_accessor.hxx" -#include - +#include "bout/build_defines.hxx" +#include "bout/macro_for_each.hxx" #include "bout/mesh.hxx" #include diff --git a/tests/unit/test_extras.hxx b/tests/unit/test_extras.hxx index 6038d981d4..4b2226a54c 100644 --- a/tests/unit/test_extras.hxx +++ b/tests/unit/test_extras.hxx @@ -14,6 +14,7 @@ #include "bout/field.hxx" #include "bout/field2d.hxx" #include "bout/field3d.hxx" +#include "bout/fieldops.hxx" #include "bout/fieldperp.hxx" #include "bout/region.hxx" From ca94ba5144a307e29fb7638761c1e5f1076633bf Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Fri, 19 Jun 2026 21:50:15 -0700 Subject: [PATCH 56/58] Fields: Move unary minus to headers Enables inlining, and should now return a lazy BinaryExpr rather than eagerly evaluating. --- include/bout/field2d.hxx | 2 +- include/bout/field3d.hxx | 2 +- include/bout/fieldperp.hxx | 2 +- src/field/field2d.cxx | 3 --- src/field/field3d.cxx | 2 -- src/field/fieldperp.cxx | 3 --- 6 files changed, 3 insertions(+), 11 deletions(-) diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index 7ab8bdfb25..8ac33e3d9d 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -538,7 +538,7 @@ auto if_else_zero(bool condition, const L& lhs) { * Unary minus. Returns the negative of given field, * iterates over whole domain including guard/boundary cells. */ -Field2D operator-(const Field2D& f); +inline auto operator-(const Field2D& f) { return -1.0 * f; } // Non-member functions diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 112e73219e..632b114273 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -859,7 +859,7 @@ Field3DParallel operator/(const Field3DParallel& lhs, BoutReal rhs); * Unary minus. Returns the negative of given field, * iterates over whole domain including guard/boundary cells. 
*/ -Field3D operator-(const Field3D& f); +inline auto operator-(const Field3D& f) { return -1.0 * f; } // Non-member functions diff --git a/include/bout/fieldperp.hxx b/include/bout/fieldperp.hxx index b16bc562bb..aba78359e0 100644 --- a/include/bout/fieldperp.hxx +++ b/include/bout/fieldperp.hxx @@ -407,7 +407,7 @@ FieldPerp operator/(BoutReal lhs, const FieldPerp& rhs); * Unary minus. Returns the negative of given field, * iterates over whole domain including guard/boundary cells. */ -FieldPerp operator-(const FieldPerp& f); +inline auto operator-(const FieldPerp& f) { return -1.0 * f; } /// Create a FieldPerp by slicing a 3D field at a given y const FieldPerp sliceXZ(const Field3D& f, int y); diff --git a/src/field/field2d.cxx b/src/field/field2d.cxx index b363eeef07..61799e3444 100644 --- a/src/field/field2d.cxx +++ b/src/field/field2d.cxx @@ -331,9 +331,6 @@ void Field2D::swapData(Field2D& other) { std::swap(data, other.data); } ////////////// NON-MEMBER OVERLOADED OPERATORS ////////////// -// Unary minus -Field2D operator-(const Field2D& f) { return -1.0 * f; } - //////////////// NON-MEMBER FUNCTIONS ////////////////// namespace { diff --git a/src/field/field3d.cxx b/src/field/field3d.cxx index 238bef1993..6db47bddfc 100644 --- a/src/field/field3d.cxx +++ b/src/field/field3d.cxx @@ -709,8 +709,6 @@ void Field3D::swapData(Field3D& other) { std::swap(data, other.data); } * NON-MEMBER OVERLOADED OPERATORS ***************************************************************/ -Field3D operator-(const Field3D& f) { return -1.0 * f; } - //////////////// NON-MEMBER FUNCTIONS ////////////////// Field3D pow(const Field3D& lhs, const Field2D& rhs, const std::string& rgn) { diff --git a/src/field/fieldperp.cxx b/src/field/fieldperp.cxx index 5176a79c64..b7b2d9d731 100644 --- a/src/field/fieldperp.cxx +++ b/src/field/fieldperp.cxx @@ -150,9 +150,6 @@ FieldPerp fromFieldAligned(const FieldPerp& f, const std::string& region) { ////////////// NON-MEMBER OVERLOADED OPERATORS 
////////////// -// Unary minus -FieldPerp operator-(const FieldPerp& f) { return -1.0 * f; } - ///////////////////////////////////////////////// // functions From 43c50503373785ca889494729c8b174792935c8a Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Fri, 19 Jun 2026 22:02:51 -0700 Subject: [PATCH 57/58] BOUT_FORCEINLINE: Strengthen for CPU case If using Clang or GCC set the always_inline attribute --- include/bout/build_config.hxx | 10 ++++++++++ include/bout/field2d.hxx | 4 ++-- include/bout/field3d.hxx | 14 ++------------ include/bout/fieldops.hxx | 19 ++++++++++++++++--- include/bout/fieldperp.hxx | 7 ++----- 5 files changed, 32 insertions(+), 22 deletions(-) diff --git a/include/bout/build_config.hxx b/include/bout/build_config.hxx index 60c55abad7..abc78e2cf0 100644 --- a/include/bout/build_config.hxx +++ b/include/bout/build_config.hxx @@ -53,6 +53,16 @@ constexpr auto use_msgstack = static_cast(BOUT_USE_MSGSTACK); #define BOUT_HOST __host__ #define BOUT_DEVICE __device__ #define BOUT_FORCEINLINE __forceinline__ +#elif defined(_MSC_VER) +#define BOUT_HOST_DEVICE +#define BOUT_HOST +#define BOUT_DEVICE +#define BOUT_FORCEINLINE __forceinline +#elif defined(__clang__) || defined(__GNUC__) +#define BOUT_HOST_DEVICE +#define BOUT_HOST +#define BOUT_DEVICE +#define BOUT_FORCEINLINE inline __attribute__((always_inline)) #else #define BOUT_HOST_DEVICE #define BOUT_HOST diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index 8ac33e3d9d..cc6482edcc 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -332,10 +332,10 @@ public: BoutReal* data; int mul = 1; int div = 1; - BOUT_HOST_DEVICE inline BoutReal operator()(int idx) const { + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal operator()(int idx) const { return data[(idx * mul / div)]; } - BOUT_HOST_DEVICE inline BoutReal& operator[](int idx) const { + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal& operator[](int idx) const { return data[(idx * mul) / div]; } diff --git 
a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 632b114273..4524573bee 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -459,10 +459,10 @@ public: BoutReal* data; int mul = 1; int div = 1; - BOUT_HOST_DEVICE inline BoutReal operator()(int idx) const { + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal operator()(int idx) const { return data[(idx * mul) / div]; } - BOUT_HOST_DEVICE inline BoutReal& operator[](int idx) const { + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal& operator[](int idx) const { return data[(idx * mul) / div]; } @@ -1077,16 +1077,6 @@ inline Field3D& Field3D::operator-=(const Field3DParallel& rhs) { return (*this) -= rhs.asField3D(); } -// A raw Field3D is an expression leaf -template <> -struct is_expr_field3d : std::true_type {}; - -template <> -struct is_expr_field3d : std::true_type {}; - -template <> -struct is_expr_field2d : std::true_type {}; - template struct is_expr_field3d> : std::integral_constant>::value diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 25765e6f6b..53e28042c9 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -20,12 +20,16 @@ class Mesh; class Field3D; +class Field3DParallel; class Field2D; class FieldPerp; template struct is_expr_field2d : std::false_type {}; +template <> +struct is_expr_field2d : std::true_type {}; + template inline constexpr bool is_expr_field2d_v = is_expr_field2d>::value; @@ -33,9 +37,18 @@ inline constexpr bool is_expr_field2d_v = is_expr_field2d>::valu template struct is_expr_field3d : std::false_type {}; +template <> +struct is_expr_field3d : std::true_type {}; + +template <> +struct is_expr_field3d : std::true_type {}; + template struct is_expr_fieldperp : std::false_type {}; +template <> +struct is_expr_fieldperp : std::true_type {}; + template inline constexpr bool is_expr_fieldperp_v = is_expr_fieldperp>::value; @@ -346,8 +359,8 @@ struct BinaryExpr { BinaryExpr& operator=(const BinaryExpr&) = delete; BinaryExpr& 
operator=(BinaryExpr&&) = delete; - inline int size() const { return indices.size(); } - inline BoutReal operator()(int idx) const { + BOUT_HOST_DEVICE BOUT_FORCEINLINE int size() const { return indices.size(); } + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal operator()(int idx) const { return f(idx, lhs, rhs); // single‐pass fusion } template @@ -359,7 +372,7 @@ struct BinaryExpr { return operator()(d.ind); } } - inline int regionIdx(int idx) const { return indices[idx]; } + BOUT_HOST_DEVICE BOUT_FORCEINLINE int regionIdx(int idx) const { return indices[idx]; } //operator ResT() { return ResT{*this}; } struct View { diff --git a/include/bout/fieldperp.hxx b/include/bout/fieldperp.hxx index aba78359e0..fc45102b89 100644 --- a/include/bout/fieldperp.hxx +++ b/include/bout/fieldperp.hxx @@ -338,10 +338,10 @@ public: BoutReal* data; int mul = 1; int div = 1; - BOUT_HOST_DEVICE inline BoutReal operator()(int idx) const { + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal operator()(int idx) const { return data[(idx * mul) / div]; } - BOUT_HOST_DEVICE inline BoutReal& operator[](int idx) const { + BOUT_HOST_DEVICE BOUT_FORCEINLINE BoutReal& operator[](int idx) const { return data[(idx * mul) / div]; } @@ -451,9 +451,6 @@ bool operator==(const FieldPerp& a, const FieldPerp& b); /// Output a string describing a FieldPerp to a stream std::ostream& operator<<(std::ostream& out, const FieldPerp& value); -template <> -struct is_expr_fieldperp : std::true_type {}; - template struct is_expr_fieldperp> : std::integral_constant> From 50839f16a3421d85aec0651632eaf22b788f6da0 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Fri, 19 Jun 2026 22:40:02 -0700 Subject: [PATCH 58/58] Fields: Move move-assignment to headers Fix unary minus so that it avoids dependency loop with Mesh.
--- include/bout/field2d.hxx | 12 +++++++++++- include/bout/field3d.hxx | 36 +++++++++++++++++++++++++++++++++--- include/bout/fieldperp.hxx | 27 ++++++++++++++++++++++----- src/field/field3d.cxx | 23 ----------------------- src/mesh/difops.cxx | 12 +++++++++--- 5 files changed, 75 insertions(+), 35 deletions(-) diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index cc6482edcc..0a670fa0d0 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -538,7 +538,17 @@ auto if_else_zero(bool condition, const L& lhs) { * Unary minus. Returns the negative of given field, * iterates over whole domain including guard/boundary cells. */ -inline auto operator-(const Field2D& f) { return -1.0 * f; } +inline auto operator-(const Field2D& f) { + return BinaryExpr, Field2D, bout::op::Mul>{ + static_cast::View>(-1.0), + static_cast(f), + bout::op::Mul{}, + f.getMesh(), + f.getLocation(), + f.getDirections(), + std::nullopt, + f.getRegion("RGN_ALL")}; +} // Non-member functions diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 4524573bee..09c7f1ce55 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -207,7 +207,7 @@ public: setRegion(expr.getRegionID()); } /// Destructor - ~Field3D() override; + ~Field3D() override { delete deriv; } /// Data type stored in this field using value_type = BoutReal; @@ -482,7 +482,26 @@ public: /// Assignment operators ///@{ Field3D& operator=(const Field3D& rhs); - Field3D& operator=(Field3D&& rhs) noexcept; + Field3D& operator=(Field3D&& rhs) noexcept { + track(rhs, "operator="); + + // Move parallel slices or delete existing ones. 
+ yup_fields = std::move(rhs.yup_fields); + ydown_fields = std::move(rhs.ydown_fields); + + // Move the data and data sizes + nx = rhs.nx; + ny = rhs.ny; + nz = rhs.nz; + regionID = rhs.regionID; + + data = std::move(rhs.data); + + // Move base slice last + Field::operator=(std::move(rhs)); + + return *this; + } Field3D& operator=(const Field2D& rhs); /// return void, as only part initialised void operator=(const FieldPerp& rhs); @@ -859,7 +878,18 @@ Field3DParallel operator/(const Field3DParallel& lhs, BoutReal rhs); * Unary minus. Returns the negative of given field, * iterates over whole domain including guard/boundary cells. */ -inline auto operator-(const Field3D& f) { return -1.0 * f; } +inline auto operator-(const Field3D& f) { + auto regionID = f.getRegionID(); + return BinaryExpr, Field3D, bout::op::Mul>{ + static_cast::View>(-1.0), + static_cast(f), + bout::op::Mul{}, + f.getMesh(), + f.getLocation(), + f.getDirections(), + regionID, + f.getRegion("RGN_ALL")}; +} // Non-member functions diff --git a/include/bout/fieldperp.hxx b/include/bout/fieldperp.hxx index fc45102b89..36a116e1b5 100644 --- a/include/bout/fieldperp.hxx +++ b/include/bout/fieldperp.hxx @@ -95,7 +95,9 @@ public: template < typename ResT, typename L, typename R, typename Func, - typename = std::enable_if_t<(is_expr_fieldperp_v && is_expr_fieldperp_v)>> + typename = std::enable_if_t<(is_expr_fieldperp_v && is_expr_fieldperp_v) + || (is_expr_constant_v && is_expr_fieldperp_v) + || (is_expr_fieldperp_v && is_expr_constant_v)>> FieldPerp(const BinaryExpr& expr) : FieldPerp(evaluateBinaryExpr(expr), expr.getMesh(), expr.getLocation(), expr.getIndex(), expr.getDirections()) {} @@ -109,7 +111,7 @@ public: FieldPerp& operator=(FieldPerp&& rhs) = default; FieldPerp& operator=(BoutReal rhs); template - std::enable_if_t, FieldPerp&> + std::enable_if_t || is_expr_constant_v, FieldPerp&> operator=(const BinaryExpr& expr) { if (!isAllocated() || getMesh() != expr.getMesh()) { *this = FieldPerp{expr}; 
@@ -407,7 +409,18 @@ FieldPerp operator/(BoutReal lhs, const FieldPerp& rhs); * Unary minus. Returns the negative of given field, * iterates over whole domain including guard/boundary cells. */ -inline auto operator-(const FieldPerp& f) { return -1.0 * f; } +inline auto operator-(const FieldPerp& f) { + return BinaryExpr, FieldPerp, bout::op::Mul>{ + static_cast::View>(-1.0), + static_cast(f), + bout::op::Mul{}, + f.getMesh(), + f.getLocation(), + f.getDirections(), + std::nullopt, + f.getRegion("RGN_ALL"), + f.getIndex()}; +} /// Create a FieldPerp by slicing a 3D field at a given y const FieldPerp sliceXZ(const Field3D& f, int y); @@ -453,7 +466,11 @@ std::ostream& operator<<(std::ostream& out, const FieldPerp& value); template struct is_expr_fieldperp> - : std::integral_constant> - && is_expr_fieldperp_v>> {}; + : std::integral_constant> + && is_expr_fieldperp_v>) + || (is_expr_constant_v> + && is_expr_fieldperp_v>) + || (is_expr_fieldperp_v> + && is_expr_constant_v>)> {}; #endif diff --git a/src/field/field3d.cxx b/src/field/field3d.cxx index 6db47bddfc..7915579440 100644 --- a/src/field/field3d.cxx +++ b/src/field/field3d.cxx @@ -155,8 +155,6 @@ Field3D::Field3D(Array data_in, Mesh* localmesh, CELL_LOC datalocation ASSERT1(data.size() == nx * ny * nz); } -Field3D::~Field3D() { delete deriv; } - Field3D& Field3D::allocate() { if (data.empty()) { if (!fieldmesh) { @@ -313,27 +311,6 @@ Field3D& Field3D::operator=(const Field3D& rhs) { return *this; } -Field3D& Field3D::operator=(Field3D&& rhs) noexcept { - track(rhs, "operator="); - - // Move parallel slices or delete existing ones. 
- yup_fields = std::move(rhs.yup_fields); - ydown_fields = std::move(rhs.ydown_fields); - - // Move the data and data sizes - nx = rhs.nx; - ny = rhs.ny; - nz = rhs.nz; - regionID = rhs.regionID; - - data = std::move(rhs.data); - - // Move base slice last - Field::operator=(std::move(rhs)); - - return *this; -} - Field3D& Field3D::operator=(const Field2D& rhs) { track(rhs, "operator="); diff --git a/src/mesh/difops.cxx b/src/mesh/difops.cxx index 902ebfce93..3f7aec08bf 100644 --- a/src/mesh/difops.cxx +++ b/src/mesh/difops.cxx @@ -2,9 +2,9 @@ * Various differential operators defined on BOUT grid * ************************************************************************** - * Copyright 2010 B.D.Dudson, S.Farley, M.V.Umansky, X.Q.Xu + * Copyright 2010 - 2026 BOUT++ contributors * - * Contact: Ben Dudson, bd512@york.ac.uk + * Contact: Ben Dudson, dudson2@llnl.gov * * This file is part of BOUT++. * @@ -26,10 +26,16 @@ #include "bout/build_defines.hxx" #include +#include +#include +#include #include #include #include +#include +#include #include +#include #include #include #include @@ -800,7 +806,7 @@ Field3D bracket(const Field2D& f, const Field3D& g, BRACKET_METHOD method, break; case BRACKET_SIMPLE: { // Use a subset of terms for comparison to BOUT-06 - result = VDDZ(-DDX(f, outloc), g, outloc); + result = VDDZ(Field3D{-DDX(f, outloc)}, g, outloc); break; } default: {