5#ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_ 
    6#define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_ 
   20#include <ginkgo/core/base/device.hpp> 
   21#include <ginkgo/core/base/fwd_decls.hpp> 
   22#include <ginkgo/core/base/machine_topology.hpp> 
   23#include <ginkgo/core/base/memory.hpp> 
   24#include <ginkgo/core/base/scoped_device_id_guard.hpp> 
   25#include <ginkgo/core/base/types.hpp> 
   26#include <ginkgo/core/log/logger.hpp> 
   27#include <ginkgo/core/synthesizer/containers.hpp> 
   68constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;
 
   70constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
 
   76    allocation_mode::unified_global;
 
   78#if (GINKGO_HIP_PLATFORM_HCC == 1) 
   81constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
 
   87    allocation_mode::unified_global;
 
  101enum class dpcpp_queue_property {
 
  113GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
 
  114                                                         dpcpp_queue_property b)
 
  116    return static_cast<dpcpp_queue_property
>(
static_cast<int>(a) |
 
  117                                             static_cast<int>(b));
 
  124#define GKO_FORWARD_DECLARE(_type, ...) class _type 
  126GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
 
  128#undef GKO_FORWARD_DECLARE 
  260#define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \ 
  261    virtual void run(std::shared_ptr<const _type>) const 
  263    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);
 
  265#undef GKO_DECLARE_RUN_OVERLOAD 
  268    virtual void run(std::shared_ptr<const ReferenceExecutor> executor) 
const;
 
 
  291template <
typename Closure>
 
  292class RegisteredOperation : 
public Operation {
 
  300    RegisteredOperation(
const char* name, Closure op)
 
  301        : name_(name), op_(std::move(op))
 
  304    const char* 
get_name() 
const noexcept override { 
return name_; }
 
  306    void run(std::shared_ptr<const ReferenceExecutor> exec)
 const override 
  311    void run(std::shared_ptr<const OmpExecutor> exec)
 const override 
  316    void run(std::shared_ptr<const CudaExecutor> exec)
 const override 
  321    void run(std::shared_ptr<const HipExecutor> exec)
 const override 
  326    void run(std::shared_ptr<const DpcppExecutor> exec)
 const override 
  337template <
typename Closure>
 
  338RegisteredOperation<Closure> make_register_operation(
const char* name,
 
  341    return RegisteredOperation<Closure>{name, std::move(op)};
 
  419#define GKO_REGISTER_OPERATION(_name, _kernel)                                 \ 
  420    template <typename... Args>                                                \ 
  421    auto make_##_name(Args&&... args)                                          \ 
  423        return ::gko::detail::make_register_operation(                         \ 
  424            #_kernel, [&args...](auto exec) {                                  \ 
  425                using exec_type = decltype(exec);                              \ 
  428                        std::shared_ptr<const ::gko::ReferenceExecutor>>::     \ 
  430                    ::gko::kernels::reference::_kernel(                        \ 
  431                        std::dynamic_pointer_cast<                             \ 
  432                            const ::gko::ReferenceExecutor>(exec),             \ 
  433                        std::forward<Args>(args)...);                          \ 
  434                } else if (std::is_same<                                       \ 
  436                               std::shared_ptr<const ::gko::OmpExecutor>>::    \ 
  438                    ::gko::kernels::omp::_kernel(                              \ 
  439                        std::dynamic_pointer_cast<const ::gko::OmpExecutor>(   \ 
  441                        std::forward<Args>(args)...);                          \ 
  442                } else if (std::is_same<                                       \ 
  444                               std::shared_ptr<const ::gko::CudaExecutor>>::   \ 
  446                    ::gko::kernels::cuda::_kernel(                             \ 
  447                        std::dynamic_pointer_cast<const ::gko::CudaExecutor>(  \ 
  449                        std::forward<Args>(args)...);                          \ 
  450                } else if (std::is_same<                                       \ 
  452                               std::shared_ptr<const ::gko::HipExecutor>>::    \ 
  454                    ::gko::kernels::hip::_kernel(                              \ 
  455                        std::dynamic_pointer_cast<const ::gko::HipExecutor>(   \ 
  457                        std::forward<Args>(args)...);                          \ 
  458                } else if (std::is_same<                                       \ 
  460                               std::shared_ptr<const ::gko::DpcppExecutor>>::  \ 
  462                    ::gko::kernels::dpcpp::_kernel(                            \ 
  463                        std::dynamic_pointer_cast<const ::gko::DpcppExecutor>( \ 
  465                        std::forward<Args>(args)...);                          \ 
  467                    GKO_NOT_IMPLEMENTED;                                       \ 
  471    static_assert(true,                                                        \ 
  472                  "This assert is used to counter the false positive extra "   \ 
  473                  "semi-colon warnings") 
 
  513#define GKO_REGISTER_HOST_OPERATION(_name, _kernel)                          \ 
  514    template <typename... Args>                                              \ 
  515    auto make_##_name(Args&&... args)                                        \ 
  517        return ::gko::detail::make_register_operation(                       \ 
  519            [&args...](auto) { _kernel(std::forward<Args>(args)...); });     \ 
  521    static_assert(true,                                                      \ 
  522                  "This assert is used to counter the false positive extra " \ 
  523                  "semi-colon warnings") 
 
  526#define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type 
  616    template <
typename T>
 
  617    friend class detail::ExecutorBase;
 
  619    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
 
  620    friend class ReferenceExecutor;
 
  623    virtual ~Executor() = 
default;
 
  625    Executor() = 
default;
 
  626    Executor(Executor&) = 
delete;
 
  627    Executor(Executor&&) = 
delete;
 
  628    Executor& operator=(Executor&) = 
delete;
 
  629    Executor& operator=(Executor&&) = 
delete;
 
  652    template <
typename ClosureOmp, 
typename ClosureCuda, 
typename ClosureHip,
 
  653              typename ClosureDpcpp>
 
  655        "Please use the overload with std::string as first parameter.")
 
  656    void 
run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
 
  657             const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
 const 
  659        LambdaOperation<ClosureOmp, ClosureOmp, ClosureCuda, ClosureHip,
 
  661            op(op_omp, op_cuda, op_hip, op_dpcpp);
 
 
  681    template <
typename ClosureReference, 
typename ClosureOmp,
 
  682              typename ClosureCuda, 
typename ClosureHip, 
typename ClosureDpcpp>
 
  683    void run(std::string name, 
const ClosureReference& op_ref,
 
  684             const ClosureOmp& op_omp, 
const ClosureCuda& op_cuda,
 
  685             const ClosureHip& op_hip, 
const ClosureDpcpp& op_dpcpp)
 const 
  687        LambdaOperation<ClosureReference, ClosureOmp, ClosureCuda, ClosureHip,
 
  689            op(std::move(name), op_ref, op_omp, op_cuda, op_hip, op_dpcpp);
 
 
  704    template <
typename T>
 
  707        this->
template log<log::Logger::allocation_started>(
 
  708            this, num_elems * 
sizeof(T));
 
  709        T* allocated = 
static_cast<T*
>(this->raw_alloc(num_elems * 
sizeof(T)));
 
  710        this->
template log<log::Logger::allocation_completed>(
 
  711            this, num_elems * 
sizeof(T), 
reinterpret_cast<uintptr>(allocated));
 
 
  722    void free(
void* ptr) 
const noexcept 
  724        this->
template log<log::Logger::free_started>(
 
  725            this, 
reinterpret_cast<uintptr>(ptr));
 
  727        this->
template log<log::Logger::free_completed>(
 
  728            this, 
reinterpret_cast<uintptr>(ptr));
 
 
  743    template <
typename T>
 
  745                   const T* src_ptr, T* dest_ptr)
 const 
  747        const auto src_loc = 
reinterpret_cast<uintptr>(src_ptr);
 
  748        const auto dest_loc = 
reinterpret_cast<uintptr>(dest_ptr);
 
  749        this->
template log<log::Logger::copy_started>(
 
  750            src_exec.
get(), 
this, src_loc, dest_loc, num_elems * 
sizeof(T));
 
  751        if (
this != src_exec.
get()) {
 
  752            src_exec->template log<log::Logger::copy_started>(
 
  753                src_exec.
get(), 
this, src_loc, dest_loc, num_elems * 
sizeof(T));
 
  756            this->raw_copy_from(src_exec.
get(), num_elems * 
sizeof(T), src_ptr,
 
  759#if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG) 
  762            std::clog << 
"Not direct copy. Try to copy data from the masters." 
  765            auto src_master = src_exec->get_master().
get();
 
  766            if (num_elems > 0 && src_master != src_exec.
get()) {
 
  767                auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
 
  768                src_master->copy_from<T>(src_exec, num_elems, src_ptr,
 
  770                this->
copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
 
  771                src_master->free(master_ptr);
 
  774        this->
template log<log::Logger::copy_completed>(
 
  775            src_exec.
get(), 
this, src_loc, dest_loc, num_elems * 
sizeof(T));
 
  776        if (
this != src_exec.
get()) {
 
  777            src_exec->template log<log::Logger::copy_completed>(
 
  778                src_exec.
get(), 
this, src_loc, dest_loc, num_elems * 
sizeof(T));
 
 
  793    template <
typename T>
 
  796        this->
copy_from(
this, num_elems, src_ptr, dest_ptr);
 
 
  808    template <
typename T>
 
  812        this->
get_master()->copy_from(
this, 1, ptr, &out);
 
 
  840        this->propagating_logger_refcount_.fetch_add(
 
  841            logger->needs_propagation() ? 1 : 0);
 
  842        this->EnableLogging<Executor>::add_logger(logger);
 
 
  853        this->propagating_logger_refcount_.fetch_sub(
 
  855        this->EnableLogging<Executor>::remove_logger(logger);
 
 
  858    using EnableLogging<
Executor>::remove_logger;
 
  869        log_propagation_mode_ = mode;
 
 
  881        return this->propagating_logger_refcount_.load() > 0 &&
 
 
  894        return this->verify_memory_from(other.get());
 
 
  916        std::string device_type;
 
  931        int num_computing_units = -1;
 
  944        int num_pu_per_cu = -1;
 
  954        std::vector<int> subgroup_sizes{};
 
  964        int max_subgroup_size = -1;
 
  976        std::vector<int> max_workitem_sizes{};
 
  987        int max_workgroup_size;
 
 1004        std::string pci_bus_id = std::string(13, 
'x');
 
 1016        std::vector<int> closest_pu_ids{};
 
 1024    const exec_info& get_exec_info()
 const { 
return this->exec_info_; }
 
 1035    virtual void* raw_alloc(size_type size) 
const = 0;
 
 1044    virtual void raw_free(
void* ptr) 
const noexcept = 0;
 
 1056    virtual void raw_copy_from(
const Executor* src_exec, size_type n_bytes,
 
 1057                               const void* src_ptr, 
void* dest_ptr) 
const = 0;
 
 1068#define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...)                              \ 
 1069    virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \ 
 1070                             const void* src_ptr, void* dest_ptr) const = 0 
 1072    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
 
 1074#undef GKO_ENABLE_RAW_COPY_TO 
 1083    virtual bool verify_memory_from(
const Executor* src_exec) 
const = 0;
 
 1094#define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \ 
 1095    virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0 
 1097    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
 
 1099    GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);
 
 1101#undef GKO_ENABLE_VERIFY_MEMORY_TO 
 1109    virtual void populate_exec_info(
const machine_topology* mach_topo) = 0;
 
 1116    exec_info& get_exec_info() { 
return this->exec_info_; }
 
 1118    exec_info exec_info_;
 
 1122    std::atomic<int> propagating_logger_refcount_{};
 
 1139    template <
typename ClosureReference, 
typename ClosureOmp,
 
 1140              typename ClosureCuda, 
typename ClosureHip, 
typename ClosureDpcpp>
 
 1141    class LambdaOperation : 
public Operation {
 
 1143        LambdaOperation(std::string name, 
const ClosureReference& op_ref,
 
 1144                        const ClosureOmp& op_omp, 
const ClosureCuda& op_cuda,
 
 1145                        const ClosureHip& op_hip, 
const ClosureDpcpp& op_dpcpp)
 
 1146            : name_(std::move(name)),
 
 1164        LambdaOperation(
const ClosureOmp& op_omp, 
const ClosureCuda& op_cuda,
 
 1165                        const ClosureHip& op_hip, 
const ClosureDpcpp& op_dpcpp)
 
 1166            : LambdaOperation(
"unnamed", op_omp, op_omp, op_cuda, op_hip,
 
 1170        void run(std::shared_ptr<const OmpExecutor>)
 const override 
 1175        void run(std::shared_ptr<const ReferenceExecutor>)
 const override 
 1180        void run(std::shared_ptr<const CudaExecutor>)
 const override 
 1185        void run(std::shared_ptr<const HipExecutor>)
 const override 
 1190        void run(std::shared_ptr<const DpcppExecutor>)
 const override 
 1195        const char* get_name() const noexcept
 override { 
return name_.c_str(); }
 
 1199        ClosureReference op_ref_;
 
 1201        ClosureCuda op_cuda_;
 
 1203        ClosureDpcpp op_dpcpp_;
 
 
 1216template <
typename T>
 
 1243    std::shared_ptr<const Executor> exec_;
 
 
 1247template <
typename T>
 
 1248class executor_deleter<T[]> {
 
 1250    using pointer = T[];
 
 1252    explicit executor_deleter(std::shared_ptr<const Executor> exec)
 
 1256    void operator()(pointer ptr)
 const 
 1264    std::shared_ptr<const Executor> exec_;
 
 
 1271template <
typename ConcreteExecutor>
 
 1272class ExecutorBase : 
public Executor {
 
 1275    friend class ::gko::OmpExecutor;
 
 1276    friend class ::gko::HipExecutor;
 
 1277    friend class ::gko::DpcppExecutor;
 
 1278    friend class ::gko::CudaExecutor;
 
 1279    friend class ::gko::ReferenceExecutor;
 
 1282    void run(
const Operation& op)
 const override 
 1285        auto scope_guard = get_scoped_device_id_guard();
 
 1286        op.run(self()->shared_from_this());
 
 1291    void raw_copy_from(
const Executor* src_exec, size_type n_bytes,
 
 1292                       const void* src_ptr, 
void* dest_ptr)
 const override 
 1294        src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
 
 1297    virtual bool verify_memory_from(
const Executor* src_exec)
 const override 
 1299        return src_exec->verify_memory_to(self());
 
 1303    ConcreteExecutor* self() noexcept
 
 1305        return static_cast<ConcreteExecutor*
>(
this);
 
 1308    const ConcreteExecutor* self() const noexcept
 
 1310        return static_cast<const ConcreteExecutor*
>(
this);
 
 1314#undef GKO_DECLARE_EXECUTOR_FRIEND 
 1324class EnableDeviceReset {
 
 1332        "device_reset is no longer supported, call " 
 1333        "cudaDeviceReset/hipDeviceReset manually")
 
 1334    void set_device_reset(
bool device_reset) {}
 
 1342        "device_reset is no longer supported, call " 
 1343        "cudaDeviceReset/hipDeviceReset manually")
 
 1344    bool get_device_reset() { 
return false; }
 
 1352    EnableDeviceReset() {}
 
 1355        "device_reset is no longer supported, call " 
 1356        "cudaDeviceReset/hipDeviceReset manually")
 
 1357    EnableDeviceReset(
bool device_reset) {}
 
 1364#define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...)                    \ 
 1365    void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \ 
 1366                     const void* src_ptr, void* dest_ptr) const override 
 1369#define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_)                     \ 
 1370    virtual bool verify_memory_to(const dest_* other) const override         \ 
 1374    static_assert(true,                                                      \ 
 1375                  "This assert is used to counter the false positive extra " \ 
 1376                  "semi-colon warnings") 
 1386class OmpExecutor : 
public detail::ExecutorBase<OmpExecutor>,
 
 1387                    public std::enable_shared_from_this<OmpExecutor> {
 
 1388    friend class detail::ExecutorBase<OmpExecutor>;
 
 1397        std::shared_ptr<CpuAllocatorBase> 
alloc =
 
 1398            std::make_shared<CpuAllocator>())
 
 1400        return std::shared_ptr<OmpExecutor>(
new OmpExecutor(std::move(
alloc)));
 
 
 1409    int get_num_cores()
 const 
 1411        return this->get_exec_info().num_computing_units;
 
 1414    int get_num_threads_per_core()
 const 
 1416        return this->get_exec_info().num_pu_per_cu;
 
 1419    static int get_num_omp_threads();
 
 1421    scoped_device_id_guard get_scoped_device_id_guard() 
const override;
 
 1426    OmpExecutor(std::shared_ptr<CpuAllocatorBase> 
alloc)
 
 1434    void* raw_alloc(
size_type size) 
const override;
 
 1436    void raw_free(
void* ptr) 
const noexcept override;
 
 1438    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
 
 1440    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(
OmpExecutor, 
true);
 
 1444    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(
HipExecutor, 
false);
 
 1446    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(
CudaExecutor, 
false);
 
 1448    bool verify_memory_to(
const DpcppExecutor* dest_exec) 
const override;
 
 1450    std::shared_ptr<CpuAllocatorBase> alloc_;
 
 
 1456using DefaultExecutor = OmpExecutor;
 
 1468class ReferenceExecutor : 
public OmpExecutor {
 
 1472    static std::shared_ptr<ReferenceExecutor> create(
 
 1473        std::shared_ptr<CpuAllocatorBase> 
alloc =
 
 1474            std::make_shared<CpuAllocator>())
 
 1476        return std::shared_ptr<ReferenceExecutor>(
 
 1477            new ReferenceExecutor(std::move(
alloc)));
 
 1490        op.run(std::static_pointer_cast<const ReferenceExecutor>(
 
 1491            this->shared_from_this()));
 
 
 1499        this->ReferenceExecutor::populate_exec_info(
 
 1503    void populate_exec_info(
const machine_topology*)
 override 
 1505        this->get_exec_info().device_id = -1;
 
 1506        this->get_exec_info().num_computing_units = 1;
 
 1507        this->get_exec_info().num_pu_per_cu = 1;
 
 1510    bool verify_memory_from(
const Executor* src_exec)
 const override 
 1512        return src_exec->verify_memory_to(
this);
 
 1515    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, 
true);
 
 1517    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, 
false);
 
 1519    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, 
false);
 
 1521    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, 
false);
 
 1523    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, 
false);
 
 
 1528namespace reference {
 
 1529using DefaultExecutor = ReferenceExecutor;
 
 1540class CudaExecutor : 
public detail::ExecutorBase<CudaExecutor>,
 
 1541                     public std::enable_shared_from_this<CudaExecutor>,
 
 1542                     public detail::EnableDeviceReset {
 
 1543    friend class detail::ExecutorBase<CudaExecutor>;
 
 1560        "calling this CudaExecutor::create method is deprecated, because" 
 1561        "device_reset no longer has an effect" 
 1562        "call CudaExecutor::create(" 
 1563        "  int device_id, std::shared_ptr<Executor> master," 
 1564        "  std::shared_ptr<CudaAllocatorBase> alloc," 
 1565        "  CUstream_st* stream);" 
 1568        int device_id, 
std::shared_ptr<
Executor> master, 
bool device_reset,
 
 1570        CUstream_st* stream = 
nullptr);
 
 1585        CUstream_st* stream = 
nullptr);
 
 1602        return this->get_exec_info().device_id;
 
 
 1615        return this->get_exec_info().num_pu_per_cu;
 
 
 1623        return this->get_exec_info().num_computing_units;
 
 
 1631        return this->get_exec_info().num_computing_units *
 
 1632               this->get_exec_info().num_pu_per_cu;
 
 
 1640        return this->get_exec_info().max_subgroup_size;
 
 
 1648        return this->get_exec_info().major;
 
 
 1656        return this->get_exec_info().minor;
 
 
 1664    GKO_DEPRECATED(
"use get_blas_handle() instead")
 
 1677    GKO_DEPRECATED(
"use get_sparselib_handle() instead")
 
 1688        return cusparse_handle_.get();
 
 
 1698        return this->get_exec_info().closest_pu_ids;
 
 
 1717    void set_gpu_property();
 
 1719    void init_handles();
 
 1721    CudaExecutor(
int device_id, std::shared_ptr<Executor> master,
 
 1722                 std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
 
 1723        : master_(master), alloc_{
std::move(alloc)}, stream_{stream}
 
 1725        this->get_exec_info().device_id = device_id;
 
 1726        this->get_exec_info().num_computing_units = 0;
 
 1727        this->get_exec_info().num_pu_per_cu = 0;
 
 1728        this->CudaExecutor::populate_exec_info(
 
 1730        this->set_gpu_property();
 
 1731        this->init_handles();
 
 1734    void* raw_alloc(
size_type size) 
const override;
 
 1736    void raw_free(
void* ptr) 
const noexcept override;
 
 1738    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
 
 1740    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, 
false);
 
 1742    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, 
false);
 
 1744    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, 
false);
 
 1746    bool verify_memory_to(
const HipExecutor* dest_exec) 
const override;
 
 1748    bool verify_memory_to(
const CudaExecutor* dest_exec) 
const override;
 
 1750    void populate_exec_info(
const machine_topology* mach_topo) 
override;
 
 1753    std::shared_ptr<Executor> master_;
 
 1755    template <
typename T>
 
 1756    using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
 
 1757    handle_manager<cublasContext> cublas_handle_;
 
 1758    handle_manager<cusparseContext> cusparse_handle_;
 
 1759    std::shared_ptr<CudaAllocatorBase> alloc_;
 
 1760    CUstream_st* stream_;
 
 
 1766using DefaultExecutor = CudaExecutor;
 
 1777class HipExecutor : 
public detail::ExecutorBase<HipExecutor>,
 
 1778                    public std::enable_shared_from_this<HipExecutor>,
 
 1779                    public detail::EnableDeviceReset {
 
 1780    friend class detail::ExecutorBase<HipExecutor>;
 
 1797        "device_reset is deprecated entirely, call hipDeviceReset directly. " 
 1798        "alloc_mode was replaced by the Allocator type " 
 1801        int device_id, 
std::shared_ptr<
Executor> master, 
bool device_reset,
 
 1803        GKO_HIP_STREAM_STRUCT* stream = 
nullptr);
 
 1805    static 
std::shared_ptr<HipExecutor> 
create(
 
 1809        GKO_HIP_STREAM_STRUCT* stream = 
nullptr);
 
 1826        return this->get_exec_info().device_id;
 
 
 1839        return this->get_exec_info().num_pu_per_cu;
 
 
 1847        return this->get_exec_info().num_computing_units;
 
 
 1855        return this->get_exec_info().major;
 
 
 1863        return this->get_exec_info().minor;
 
 
 1871        return this->get_exec_info().num_computing_units *
 
 1872               this->get_exec_info().num_pu_per_cu;
 
 
 1880        return this->get_exec_info().max_subgroup_size;
 
 
 1888    GKO_DEPRECATED(
"use get_blas_handle() instead")
 
 1901    GKO_DEPRECATED(
"use get_sparselib_handle() instead")
 
 1912        return hipsparse_handle_.get();
 
 
 1929        return this->get_exec_info().closest_pu_ids;
 
 
 1932    GKO_HIP_STREAM_STRUCT* get_stream()
 const { 
return stream_; }
 
 1935    void set_gpu_property();
 
 1937    void init_handles();
 
 1939    HipExecutor(
int device_id, std::shared_ptr<Executor> master,
 
 1940                std::shared_ptr<HipAllocatorBase> alloc,
 
 1941                GKO_HIP_STREAM_STRUCT* stream)
 
 1942        : master_{
std::move(master)}, alloc_{
std::move(alloc)}, stream_{stream}
 
 1944        this->get_exec_info().device_id = device_id;
 
 1945        this->get_exec_info().num_computing_units = 0;
 
 1946        this->get_exec_info().num_pu_per_cu = 0;
 
 1947        this->HipExecutor::populate_exec_info(machine_topology::get_instance());
 
 1948        this->set_gpu_property();
 
 1949        this->init_handles();
 
 1952    void* raw_alloc(size_type size) 
const override;
 
 1954    void raw_free(
void* ptr) 
const noexcept override;
 
 1956    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
 
 1958    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, 
false);
 
 1960    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, 
false);
 
 1962    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, 
false);
 
 1964    bool verify_memory_to(
const CudaExecutor* dest_exec) 
const override;
 
 1966    bool verify_memory_to(
const HipExecutor* dest_exec) 
const override;
 
 1968    void populate_exec_info(
const machine_topology* mach_topo) 
override;
 
 1971    std::shared_ptr<Executor> master_;
 
 1973    template <
typename T>
 
 1974    using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
 
 1975    handle_manager<hipblasContext> hipblas_handle_;
 
 1976    handle_manager<hipsparseContext> hipsparse_handle_;
 
 1977    std::shared_ptr<HipAllocatorBase> alloc_;
 
 1978    GKO_HIP_STREAM_STRUCT* stream_;
 
 
 1984using DefaultExecutor = HipExecutor;
 
 1995class DpcppExecutor : 
public detail::ExecutorBase<DpcppExecutor>,
 
 1996                      public std::enable_shared_from_this<DpcppExecutor> {
 
 1997    friend class detail::ExecutorBase<DpcppExecutor>;
 
 2011    static std::shared_ptr<DpcppExecutor> 
create(
 
 2012        int device_id, std::shared_ptr<Executor> master,
 
 2013        std::string device_type = 
"all",
 
 2014        dpcpp_queue_property property = dpcpp_queue_property::in_order);
 
 2033        return this->get_exec_info().device_id;
 
 
 2036    sycl::queue* get_queue()
 const { 
return queue_.get(); }
 
 2054        return this->get_exec_info().subgroup_sizes;
 
 
 2064        return this->get_exec_info().num_computing_units;
 
 
 2072        return this->get_exec_info().num_computing_units *
 
 2073               this->get_exec_info().num_pu_per_cu;
 
 
 2083        return this->get_exec_info().max_workitem_sizes;
 
 
 2093        return this->get_exec_info().max_workgroup_size;
 
 
 2103        return this->get_exec_info().max_subgroup_size;
 
 
 2113        return this->get_exec_info().device_type;
 
 
 2117    void set_device_property(
 
 2118        dpcpp_queue_property property = dpcpp_queue_property::in_order);
 
 2121        int device_id, std::shared_ptr<Executor> master,
 
 2122        std::string device_type = 
"all",
 
 2123        dpcpp_queue_property property = dpcpp_queue_property::in_order)
 
 2126        std::for_each(device_type.begin(), device_type.end(),
 
 2127                      [](
char& c) { c = std::tolower(c); });
 
 2128        this->get_exec_info().device_type = std::string(device_type);
 
 2129        this->get_exec_info().device_id = device_id;
 
 2130        this->set_device_property(property);
 
 2133    void populate_exec_info(
const machine_topology* mach_topo) 
override;
 
 2135    void* raw_alloc(size_type size) 
const override;
 
 2137    void raw_free(
void* ptr) 
const noexcept override;
 
 2139    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
 
 2141    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, 
false);
 
 2143    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, 
false);
 
 2145    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, 
false);
 
 2147    bool verify_memory_to(
const OmpExecutor* dest_exec) 
const override;
 
 2149    bool verify_memory_to(
const DpcppExecutor* dest_exec) 
const override;
 
 2152    std::shared_ptr<Executor> master_;
 
 2154    template <
typename T>
 
 2155    using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
 
 2156    queue_manager<sycl::queue> queue_;
 
 
 2162using DefaultExecutor = DpcppExecutor;
 
 2167#undef GKO_OVERRIDE_RAW_COPY_TO 
Implement this interface to provide an allocator for CudaExecutor.
Definition memory.hpp:40
Allocator using cudaMalloc.
Definition memory.hpp:102
This is the Executor subclass which represents the CUDA device.
Definition executor.hpp:1542
static std::shared_ptr< CudaExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_cuda_alloc_mode, CUstream_st *stream=nullptr)
Creates a new CudaExecutor.
cusparseContext * get_sparselib_handle() const
Get the cusparse handle for this executor.
Definition executor.hpp:1686
cublasContext * get_blas_handle() const
Get the cublas handle for this executor.
Definition executor.hpp:1670
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition executor.hpp:1696
int get_closest_numa() const
Get the closest NUMA node.
Definition executor.hpp:1706
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition executor.hpp:1613
int get_major_version() const noexcept
Get the major version of compute capability.
Definition executor.hpp:1646
int get_num_multiprocessor() const noexcept
Get the number of multiprocessor of this executor.
Definition executor.hpp:1621
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition executor.hpp:1638
cusparseContext * get_cusparse_handle() const
Get the cusparse handle for this executor.
Definition executor.hpp:1678
int get_device_id() const noexcept
Get the CUDA device id of the device associated to this executor.
Definition executor.hpp:1600
void synchronize() const override
Synchronize the operations launched on the executor with its master.
CUstream_st * get_stream() const
Returns the CUDA stream used by this executor.
Definition executor.hpp:1714
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition executor.hpp:1654
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition executor.hpp:1629
cublasContext * get_cublas_handle() const
Get the cublas handle for this executor.
Definition executor.hpp:1665
std::string get_description() const override
static int get_num_devices()
Get the number of devices present on the system.
This is the Executor subclass which represents a DPC++ enhanced device.
Definition executor.hpp:1996
const std::vector< int > & get_subgroup_sizes() const noexcept
Get the available subgroup sizes for this device.
Definition executor.hpp:2052
int get_num_computing_units() const noexcept
Get the number of Computing Units of this executor.
Definition executor.hpp:2062
int get_max_workgroup_size() const noexcept
Get the maximum workgroup size.
Definition executor.hpp:2091
std::string get_description() const override
int get_num_subgroups() const noexcept
Get the number of subgroups of this executor.
Definition executor.hpp:2070
void synchronize() const override
Synchronize the operations launched on the executor with its master.
const std::vector< int > & get_max_workitem_sizes() const noexcept
Get the maximum work item sizes.
Definition executor.hpp:2081
int get_max_subgroup_size() const noexcept
Get the maximum subgroup size.
Definition executor.hpp:2101
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
std::string get_device_type() const noexcept
Get a string representing the device type.
Definition executor.hpp:2111
static int get_num_devices(std::string device_type)
Get the number of devices present on the system.
int get_device_id() const noexcept
Get the DPCPP device id of the device associated to this executor.
Definition executor.hpp:2031
static std::shared_ptr< DpcppExecutor > create(int device_id, std::shared_ptr< Executor > master, std::string device_type="all", dpcpp_queue_property property=dpcpp_queue_property::in_order)
Creates a new DpcppExecutor.
The first step in using the Ginkgo library consists of creating an executor.
Definition executor.hpp:615
void free(void *ptr) const noexcept
Frees memory previously allocated with Executor::alloc().
Definition executor.hpp:722
virtual void run(const Operation &op) const =0
Runs the specified Operation using this Executor.
virtual void synchronize() const =0
Synchronize the operations launched on the executor with its master.
bool should_propagate_log() const
Returns true iff events occurring at an object created on this executor should be logged at propagati...
Definition executor.hpp:879
bool memory_accessible(const std::shared_ptr< const Executor > &other) const
Verifies whether the executors share the same memory.
Definition executor.hpp:892
void copy(size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data within this Executor.
Definition executor.hpp:794
void add_logger(std::shared_ptr< const log::Logger > logger) override
Definition executor.hpp:838
void copy_from(ptr_param< const Executor > src_exec, size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data from another Executor.
Definition executor.hpp:744
void set_log_propagation_mode(log_propagation_mode mode)
Sets the logger event propagation mode for the executor.
Definition executor.hpp:867
T * alloc(size_type num_elems) const
Allocates memory in this Executor.
Definition executor.hpp:705
virtual std::string get_description() const =0
virtual std::shared_ptr< Executor > get_master() noexcept=0
Returns the master OmpExecutor of this Executor.
T copy_val_to_host(const T *ptr) const
Retrieves a single element at the given location from executor memory.
Definition executor.hpp:809
void run(std::string name, const ClosureReference &op_ref, const ClosureOmp &op_omp, const ClosureCuda &op_cuda, const ClosureHip &op_hip, const ClosureDpcpp &op_dpcpp) const
Runs one of the passed in functors, depending on the Executor type.
Definition executor.hpp:683
void remove_logger(const log::Logger *logger) override
Definition executor.hpp:851
Implement this interface to provide an allocator for HipExecutor.
Definition memory.hpp:65
Definition memory.hpp:172
This is the Executor subclass which represents the HIP enhanced device.
Definition executor.hpp:1779
void synchronize() const override
Synchronize the operations launched on the executor with its master.
std::string get_description() const override
int get_device_id() const noexcept
Get the HIP device id of the device associated to this executor.
Definition executor.hpp:1824
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition executor.hpp:1837
int get_major_version() const noexcept
Get the major version of compute capability.
Definition executor.hpp:1853
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition executor.hpp:1927
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition executor.hpp:1861
static int get_num_devices()
Get the number of devices present on the system.
static std::shared_ptr< HipExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_hip_alloc_mode, CUstream_st *stream=nullptr)
Creates a new HipExecutor.
hipsparseContext * get_sparselib_handle() const
Get the hipsparse handle for this executor.
Definition executor.hpp:1910
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
int get_num_multiprocessor() const noexcept
Get the number of multiprocessor of this executor.
Definition executor.hpp:1845
hipblasContext * get_blas_handle() const
Get the hipblas handle for this executor.
Definition executor.hpp:1894
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition executor.hpp:1869
hipsparseContext * get_hipsparse_handle() const
Get the hipsparse handle for this executor.
Definition executor.hpp:1902
int get_closest_numa() const
Get the closest NUMA node.
Definition executor.hpp:1920
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition executor.hpp:1878
hipblasContext * get_hipblas_handle() const
Get the hipblas handle for this executor.
Definition executor.hpp:1889
NotSupported is thrown in case it is not possible to perform the requested operation on the given obj...
Definition exception.hpp:127
This is the Executor subclass which represents the OpenMP device (typically CPU).
Definition executor.hpp:1387
std::string get_description() const override
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
void synchronize() const override
Synchronize the operations launched on the executor with its master.
static std::shared_ptr< OmpExecutor > create(std::shared_ptr< CpuAllocatorBase > alloc=std::make_shared< CpuAllocator >())
Creates a new OmpExecutor.
Definition executor.hpp:1396
Operations can be used to define functionalities whose implementations differ among devices.
Definition executor.hpp:258
virtual const char * get_name() const noexcept
Returns the operation's name.
This is a specialization of the OmpExecutor, which runs the reference implementations of the kernels ...
Definition executor.hpp:1468
void run(const Operation &op) const override
Runs the specified Operation using this Executor.
Definition executor.hpp:1487
std::string get_description() const override
Definition executor.hpp:1485
executor_deleter(std::shared_ptr< const Executor > exec)
Creates a new deleter.
Definition executor.hpp:1226
void operator()(pointer ptr) const
Deletes the object.
Definition executor.hpp:1235
EnableLogging is a mixin which should be inherited by any class which wants to enable logging.
Definition logger.hpp:760
virtual bool needs_propagation() const
Returns true if this logger, when attached to an Executor, needs to be forwarded all events from obje...
Definition logger.hpp:654
The machine topology class represents the hierarchical topology of a machine, including NUMA nodes,...
Definition machine_topology.hpp:61
static machine_topology * get_instance()
Returns an instance of the machine_topology object.
Definition machine_topology.hpp:182
This class is used for function parameters in the place of raw pointers.
Definition utils_helper.hpp:41
T * get() const
Definition utils_helper.hpp:75
This move-only class uses RAII to set the device id within a scoped block, if necessary.
Definition scoped_device_id_guard.hpp:76
The logger namespace .
Definition convergence.hpp:22
The Ginkgo namespace.
Definition abstract_factory.hpp:20
std::uintptr_t uintptr
Unsigned integer type capable of holding a pointer to void.
Definition types.hpp:141
std::size_t size_type
Integral type used for allocation quantities.
Definition types.hpp:89
log_propagation_mode
How Logger events are propagated to their Executor.
Definition executor.hpp:34
@ automatic
Events get reported to loggers attached to the triggering object and propagating loggers (Logger::nee...
Definition executor.hpp:46
@ never
Events only get reported at loggers attached to the triggering object.
Definition executor.hpp:40
allocation_mode
Specify the mode of allocation for CUDA/HIP GPUs.
Definition executor.hpp:62