author     Kevin Sala <kevin.sala@bsc.es>   2023-04-12 22:02:23 +0200
committer  Kevin Sala <kevin.sala@bsc.es>   2023-05-09 15:34:15 +0200
commit     843f496b71aece4097bcc780c3bb47bd0874844a (patch)
tree       8d5b36d905ad454d648d0a6880ed01ca28d4fdc8 /openmp
parent     36d8b449cfc9850513bb2ed6c07b5b8cc9f1ae3a (diff)
download   llvm-843f496b71aece4097bcc780c3bb47bd0874844a.tar.gz
[OpenMP][libomptarget] Improve device info printing in NextGen plugins
This patch improves the device info printing in the NextGen plugins. The device info properties are composed of keys, values, and units (if necessary). These properties are pushed into a queue by each vendor-specific plugin and are later processed and printed by the common Plugin Interface. The printing format is common across the different plugins.

Differential Revision: https://reviews.llvm.org/D148178
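As a minimal sketch of the resulting flow (the property names and values below are invented for illustration; only the InfoQueueTy calls add, add<InfoLevel2>, and print come from this patch, and in practice the common GenericDeviceTy::printInfo() owns the queue and prints it after obtainInfoImpl() returns):

  InfoQueueTy Info;
  Info.add("Device Name", "SomeDevice");        // key + string value
  Info.add("Max Clock Freq", 1700, "MHz");      // key + numeric value + unit
  Info.add("Cache");                            // section header, no value
  Info.add<InfoLevel2>("L2", 8388608, "bytes"); // nested entry, indented one level
  Info.print();                                 // common, plugin-agnostic printing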
Diffstat (limited to 'openmp')
-rw-r--r--  openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp                           219
-rw-r--r--  openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp    12
-rw-r--r--  openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h      74
-rw-r--r--  openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp                             301
-rw-r--r--  openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp                  4
5 files changed, 464 insertions, 146 deletions
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 0d2d8fae149e..da8e318cf4c7 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -189,8 +189,11 @@ struct AMDGPUMemoryPoolTy {
/// Getter of the HSA memory pool.
hsa_amd_memory_pool_t get() const { return MemoryPool; }
- /// Indicate if it belongs to the global segment.
+ /// Indicate the segment to which the memory pool belongs.
bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); }
+ bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); }
+ bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); }
+ bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); }
/// Indicate if it is fine-grained memory. Valid only for global.
bool isFineGrained() const {
@@ -246,7 +249,6 @@ struct AMDGPUMemoryPoolTy {
return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s");
}
-private:
/// Get attribute from the memory pool.
template <typename Ty>
Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
@@ -255,6 +257,11 @@ private:
return Plugin::check(Status, "Error in hsa_amd_memory_pool_get_info: %s");
}
+ template <typename Ty>
+ hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
+ return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
+ }
+
/// Get attribute from the memory pool relating to an agent.
template <typename Ty>
Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind,
@@ -266,6 +273,7 @@ private:
"Error in hsa_amd_agent_memory_pool_get_info: %s");
}
+private:
/// The HSA memory pool.
hsa_amd_memory_pool_t MemoryPool;
@@ -2100,8 +2108,206 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
}
/// Print information about the device.
- Error printInfoImpl() override {
- // TODO: Implement the basic info.
+ Error obtainInfoImpl(InfoQueueTy &Info) override {
+ char TmpChar[1000];
+ const char *TmpCharPtr;
+ uint16_t Major, Minor;
+ uint32_t TmpUInt, TmpUInt2;
+ uint32_t CacheSize[4];
+ size_t TmpSt;
+ bool TmpBool;
+ uint16_t WorkgrpMaxDim[3];
+ hsa_dim3_t GridMaxDim;
+ hsa_status_t Status, Status2;
+
+ Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major);
+ Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor);
+ if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS)
+ Info.add("HSA Runtime Version",
+ std::to_string(Major) + "." + std::to_string(Minor));
+
+ Info.add("HSA OpenMP Device Number", DeviceId);
+
+ Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Product Name", TmpChar);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Device Name", TmpChar);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Vendor Name", TmpChar);
+
+ hsa_device_type_t DevType;
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType);
+ if (Status == HSA_STATUS_SUCCESS) {
+ switch (DevType) {
+ case HSA_DEVICE_TYPE_CPU:
+ TmpCharPtr = "CPU";
+ break;
+ case HSA_DEVICE_TYPE_GPU:
+ TmpCharPtr = "GPU";
+ break;
+ case HSA_DEVICE_TYPE_DSP:
+ TmpCharPtr = "DSP";
+ break;
+ default:
+ TmpCharPtr = "Unknown";
+ }
+ Info.add("Device Type", TmpCharPtr);
+ }
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Max Queues", TmpUInt);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Queue Min Size", TmpUInt);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Queue Max Size", TmpUInt);
+
+ // FIXME: This is deprecated according to HSA documentation. But using
+ // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
+ // runtime.
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize);
+ if (Status == HSA_STATUS_SUCCESS) {
+ Info.add("Cache");
+
+ for (int I = 0; I < 4; I++)
+ if (CacheSize[I])
+ Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]);
+ }
+
+ Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Cacheline Size", TmpUInt);
+
+ Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Max Clock Freq", TmpUInt, "MHz");
+
+ Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Compute Units", TmpUInt);
+
+ Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("SIMD per CU", TmpUInt);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Fast F16 Operation", TmpBool);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Wavefront Size", TmpUInt2);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Workgroup Max Size", TmpUInt);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim);
+ if (Status == HSA_STATUS_SUCCESS) {
+ Info.add("Workgroup Max Size per Dimension");
+ Info.add<InfoLevel2>("x", WorkgrpMaxDim[0]);
+ Info.add<InfoLevel2>("y", WorkgrpMaxDim[1]);
+ Info.add<InfoLevel2>("z", WorkgrpMaxDim[2]);
+ }
+
+ Status = getDeviceAttrRaw(
+ (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS) {
+ Info.add("Max Waves Per CU", TmpUInt);
+ Info.add("Max Work-item Per CU", TmpUInt * TmpUInt2);
+ }
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Grid Max Size", TmpUInt);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
+ if (Status == HSA_STATUS_SUCCESS) {
+ Info.add("Grid Max Size per Dimension");
+ Info.add<InfoLevel2>("x", GridMaxDim.x);
+ Info.add<InfoLevel2>("y", GridMaxDim.y);
+ Info.add<InfoLevel2>("z", GridMaxDim.z);
+ }
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Max fbarriers/Workgrp", TmpUInt);
+
+ Info.add("Memory Pools");
+ for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
+ std::string TmpStr, TmpStr2;
+
+ if (Pool->isGlobal())
+ TmpStr = "Global";
+ else if (Pool->isReadOnly())
+ TmpStr = "ReadOnly";
+ else if (Pool->isPrivate())
+ TmpStr = "Private";
+ else if (Pool->isGroup())
+ TmpStr = "Group";
+ else
+ TmpStr = "Unknown";
+
+ Info.add<InfoLevel2>(std::string("Pool ") + TmpStr);
+
+ if (Pool->isGlobal()) {
+ if (Pool->isFineGrained())
+ TmpStr2 += "Fine Grained ";
+ if (Pool->isCoarseGrained())
+ TmpStr2 += "Coarse Grained ";
+ if (Pool->supportsKernelArgs())
+ TmpStr2 += "Kernarg ";
+
+ Info.add<InfoLevel3>("Flags", TmpStr2);
+ }
+
+ Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add<InfoLevel3>("Size", TmpSt, "bytes");
+
+ Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
+ TmpBool);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add<InfoLevel3>("Allocatable", TmpBool);
+
+ Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
+ TmpSt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add<InfoLevel3>("Runtime Alloc Granule", TmpSt, "bytes");
+
+ Status = Pool->getAttrRaw(
+ HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add<InfoLevel3>("Runtime Alloc Alignment", TmpSt, "bytes");
+
+ Status =
+ Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add<InfoLevel3>("Accessable by all", TmpBool);
+ }
+
+ Info.add("ISAs");
+ auto Err = utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
+ Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add<InfoLevel2>("Name", TmpChar);
+
+ return Status;
+ });
+
+ // Silently consume the error.
+ if (Err)
+ consumeError(std::move(Err));
+
return Plugin::success();
}
@@ -2126,6 +2332,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::check(Status, "Error in hsa_agent_get_info: %s");
}
+ template <typename Ty>
+ hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) {
+ return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
+ }
+
/// Get the device agent.
hsa_agent_t getAgent() const override { return Agent; }
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 7ebc1d1092cd..8899f457ffd9 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -995,8 +995,16 @@ Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) {
}
Error GenericDeviceTy::printInfo() {
- // TODO: Print generic information here
- return printInfoImpl();
+ InfoQueueTy InfoQueue;
+
+ // Get the vendor-specific info entries describing the device properties.
+ if (auto Err = obtainInfoImpl(InfoQueue))
+ return Err;
+
+ // Print all info entries.
+ InfoQueue.print();
+
+ return Plugin::success();
}
Error GenericDeviceTy::createEvent(void **EventPtrStorage) {
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index a91ea81183c2..542d5185e919 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -13,6 +13,7 @@
#include <cstddef>
#include <cstdint>
+#include <deque>
#include <list>
#include <map>
#include <shared_mutex>
@@ -33,6 +34,7 @@
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBufferRef.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/TargetParser/Triple.h"
namespace llvm {
@@ -84,6 +86,76 @@ private:
__tgt_async_info *AsyncInfoPtr;
};
+/// The information level represents the level (i.e., indentation) of a
+/// key-value property in the printed info tree. The first level should be the
+/// default.
+enum InfoLevelKind { InfoLevel1 = 1, InfoLevel2, InfoLevel3 };
+
+/// Class for storing device information to be printed later. An object of
+/// this type acts as a queue of key-value properties. Each property has a
+/// key, a value, and an optional unit for the value. For printing purposes,
+/// the information can be classified into several levels. These levels are
+/// useful for defining sections and subsections. Thus, each key-value
+/// property also has an additional field indicating the level to which it
+/// belongs. Notice that we use the level to determine the indentation of the
+/// key-value property at printing time. See the enum InfoLevelKind for the
+/// list of accepted levels.
+class InfoQueueTy {
+ struct InfoQueueEntryTy {
+ std::string Key;
+ std::string Value;
+ std::string Units;
+ uint64_t Level;
+ };
+
+ std::deque<InfoQueueEntryTy> Queue;
+
+public:
+ /// Add a new info entry to the queue. The entry requires at least a key
+ /// string in \p Key. The value in \p Value is optional and can be any type
+ /// that is representable as a string. The units in \p Units are optional and
+ /// must be a string. The info level is a template parameter that defaults to
+ /// the first level (top level).
+ template <InfoLevelKind L = InfoLevel1, typename T = std::string>
+ void add(const std::string &Key, T Value = T(),
+ const std::string &Units = std::string()) {
+ assert(!Key.empty() && "Invalid info key");
+
+ // Convert the value to a string depending on its type.
+ if constexpr (std::is_same_v<T, bool>)
+ Queue.push_back({Key, Value ? "Yes" : "No", Units, L});
+ else if constexpr (std::is_arithmetic_v<T>)
+ Queue.push_back({Key, std::to_string(Value), Units, L});
+ else
+ Queue.push_back({Key, Value, Units, L});
+ }
+
+ /// Print all info entries added to the queue.
+ void print() const {
+ // We print four spaces for each level.
+ constexpr uint64_t IndentSize = 4;
+
+ // Find the maximum key length (level + key) to compute the individual
+ // indentation of each entry.
+ uint64_t MaxKeySize = 0;
+ for (const auto &Entry : Queue) {
+ uint64_t KeySize = Entry.Key.size() + Entry.Level * IndentSize;
+ if (KeySize > MaxKeySize)
+ MaxKeySize = KeySize;
+ }
+
+ // Print all info entries.
+ for (const auto &Entry : Queue) {
+ // Compute the indentations for the current entry.
+ uint64_t KeyIndentSize = Entry.Level * IndentSize;
+ uint64_t ValIndentSize =
+ MaxKeySize - (Entry.Key.size() + KeyIndentSize) + IndentSize;
+
+ llvm::outs() << std::string(KeyIndentSize, ' ') << Entry.Key
+ << std::string(ValIndentSize, ' ') << Entry.Value
+ << (Entry.Units.empty() ? "" : " ") << Entry.Units << "\n";
+ }
+ }
+};
+
/// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device.
@@ -645,7 +717,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Print information about the device.
Error printInfo();
- virtual Error printInfoImpl() = 0;
+ virtual Error obtainInfoImpl(InfoQueueTy &Info) = 0;
/// Getters of the grid values.
uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; }
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index 9e38d851196c..d823cbebc3d7 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -620,147 +620,170 @@ struct CUDADeviceTy : public GenericDeviceTy {
}
/// Print information about the device.
- Error printInfoImpl() override {
+ Error obtainInfoImpl(InfoQueueTy &Info) override {
char TmpChar[1000];
- std::string TmpStr;
+ const char *TmpCharPtr;
size_t TmpSt;
- int TmpInt, TmpInt2, TmpInt3;
-
- // TODO: All these calls should be checked, but the whole printInfo must be
- // improved, so we will refactor it in the future.
- cuDriverGetVersion(&TmpInt);
- printf(" CUDA Driver Version: \t\t%d \n", TmpInt);
- printf(" CUDA Device Number: \t\t%d \n", DeviceId);
-
- cuDeviceGetName(TmpChar, 1000, Device);
- printf(" Device Name: \t\t\t%s \n", TmpChar);
-
- cuDeviceTotalMem(&TmpSt, Device);
- printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt);
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
- Device);
- printf(" Number of Multiprocessors: \t\t%d \n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device);
- printf(" Concurrent Copy and Execution: \t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
- Device);
- printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt);
-
- cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device);
- printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
- Device),
- printf(" Registers per Block: \t\t%d \n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
- printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
- Device);
- printf(" Maximum Threads per Block: \t\t%d \n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
- cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device);
- cuDeviceGetAttribute(&TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device);
- printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
- TmpInt3);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device);
- cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device);
- cuDeviceGetAttribute(&TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device);
- printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
- TmpInt3);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device);
- printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT,
- Device);
- printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device);
- printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,
- Device);
- printf(" Execution Timeout: \t\t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device);
- printf(" Integrated Device: \t\t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY,
- Device);
- printf(" Can Map Host Memory: \t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device);
- if (TmpInt == CU_COMPUTEMODE_DEFAULT)
- TmpStr = "DEFAULT";
- else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
- TmpStr = "PROHIBITED";
- else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
- TmpStr = "EXCLUSIVE PROCESS";
- else
- TmpStr = "unknown";
- printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str());
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS,
- Device);
- printf(" Concurrent Kernels: \t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device);
- printf(" ECC Enabled: \t\t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
- Device);
- printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
- Device);
- printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, Device);
- printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
-
- cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, Device);
- printf(" Max Threads Per SMP: \t\t%d \n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT,
- Device);
- printf(" Async Engines: \t\t\t%s (%d) \n", TmpInt ? "Yes" : "No",
- TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING,
- Device);
- printf(" Unified Addressing: \t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device);
- printf(" Managed Memory: \t\t\t%s \n", TmpInt ? "Yes" : "No");
+ int TmpInt;
+
+ CUresult Res = cuDriverGetVersion(&TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("CUDA Driver Version", TmpInt);
+
+ Info.add("CUDA OpenMP Device Number", DeviceId);
+
+ Res = cuDeviceGetName(TmpChar, 1000, Device);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Device Name", TmpChar);
+
+ Res = cuDeviceTotalMem(&TmpSt, Device);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Global Memory Size", TmpSt, "bytes");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Number of Multiprocessors", TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Concurrent Copy and Execution", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Total Constant Memory", TmpInt, "bytes");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
+ TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Max Shared Memory per Block", TmpInt, "bytes");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Registers per Block", TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_WARP_SIZE, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Warp Size", TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Maximum Threads per Block", TmpInt);
+
+ Info.add("Maximum Block Dimensions", "");
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add<InfoLevel2>("x", TmpInt);
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add<InfoLevel2>("y", TmpInt);
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add<InfoLevel2>("z", TmpInt);
+
+ Info.add("Maximum Grid Dimensions", "");
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add<InfoLevel2>("x", TmpInt);
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add<InfoLevel2>("y", TmpInt);
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add<InfoLevel2>("z", TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_PITCH, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Maximum Memory Pitch", TmpInt, "bytes");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Texture Alignment", TmpInt, "bytes");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Clock Rate", TmpInt, "kHz");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Execution Timeout", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_INTEGRATED, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Integrated Device", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Can Map Host Memory", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, TmpInt);
+ if (Res == CUDA_SUCCESS) {
+ if (TmpInt == CU_COMPUTEMODE_DEFAULT)
+ TmpCharPtr = "Default";
+ else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
+ TmpCharPtr = "Prohibited";
+ else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
+ TmpCharPtr = "Exclusive process";
+ else
+ TmpCharPtr = "Unknown";
+ Info.add("Compute Mode", TmpCharPtr);
+ }
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Concurrent Kernels", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ECC_ENABLED, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("ECC Enabled", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Memory Clock Rate", TmpInt, "kHz");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Memory Bus Width", TmpInt, "bits");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("L2 Cache Size", TmpInt, "bytes");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
+ TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Max Threads Per SMP", TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Async Engines", TmpInt);
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS,
- Device);
- printf(" Concurrent Managed Memory: \t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device);
- printf(" Preemption Supported: \t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH,
- Device);
- printf(" Cooperative Launch: \t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device);
- printf(" Multi-Device Boars: \t\t%s \n", TmpInt ? "Yes" : "No");
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Unified Addressing", (bool)TmpInt);
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
- Device);
- cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
- Device);
- printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Managed Memory", (bool)TmpInt);
+
+ Res =
+ getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Concurrent Managed Memory", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
+ TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Preemption Supported", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Cooperative Launch", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Multi-Device Boars", (bool)TmpInt);
+
+ Info.add("Compute Capabilities", ComputeCapability.str());
return Plugin::success();
}
@@ -797,6 +820,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::check(Res, "Error in cuDeviceGetAttribute: %s");
}
+ CUresult getDeviceAttrRaw(uint32_t Kind, int &Value) {
+ return cuDeviceGetAttribute(&Value, (CUdevice_attribute)Kind, Device);
+ }
+
/// See GenericDeviceTy::getComputeUnitKind().
std::string getComputeUnitKind() const override {
return ComputeCapability.str();
diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
index a9b828826b1b..a47015e2fa29 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -294,8 +294,8 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
Error syncEventImpl(void *EventPtr) override { return Plugin::success(); }
/// Print information about the device.
- Error printInfoImpl() override {
- printf(" This is a generic-elf-64bit device\n");
+ Error obtainInfoImpl(InfoQueueTy &Info) override {
+ Info.add("Device Type", "Generic-elf-64bit");
return Plugin::success();
}
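
For a rough sense of the output layout that the new InfoQueueTy::print() produces, here is a sketch with a few invented entries (only the four-spaces-per-level indentation and the value column alignment follow the implementation added to PluginInterface.h; actual keys and values depend on the device):

    Device Name       SomeDevice
    Max Clock Freq    1700 MHz
    Cache
        L2            8388608 bytes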