-rw-r--r--  openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp                          | 219
-rw-r--r--  openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp  |  12
-rw-r--r--  openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h    |  74
-rw-r--r--  openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp                            | 301
-rw-r--r--  openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp               |   4
5 files changed, 464 insertions(+), 146 deletions(-)
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 0d2d8fae149e..da8e318cf4c7 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -189,8 +189,11 @@ struct AMDGPUMemoryPoolTy {
/// Getter of the HSA memory pool.
hsa_amd_memory_pool_t get() const { return MemoryPool; }
- /// Indicate if it belongs to the global segment.
+ /// Indicate the segment to which the memory pool belongs.
bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); }
+ bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); }
+ bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); }
+ bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); }
/// Indicate if it is fine-grained memory. Valid only for global.
bool isFineGrained() const {
@@ -246,7 +249,6 @@ struct AMDGPUMemoryPoolTy {
return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s");
}
-private:
/// Get attribute from the memory pool.
template <typename Ty>
Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
@@ -255,6 +257,11 @@ private:
return Plugin::check(Status, "Error in hsa_amd_memory_pool_get_info: %s");
}
+ template <typename Ty>
+ hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
+ return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
+ }
+
/// Get attribute from the memory pool relating to an agent.
template <typename Ty>
Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind,
@@ -266,6 +273,7 @@ private:
"Error in hsa_amd_agent_memory_pool_get_info: %s");
}
+private:
/// The HSA memory pool.
hsa_amd_memory_pool_t MemoryPool;
@@ -2100,8 +2108,206 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
}
/// Print information about the device.
- Error printInfoImpl() override {
- // TODO: Implement the basic info.
+ Error obtainInfoImpl(InfoQueueTy &Info) override {
+ char TmpChar[1000];
+ const char *TmpCharPtr;
+ uint16_t Major, Minor;
+ uint32_t TmpUInt, TmpUInt2;
+ uint32_t CacheSize[4];
+ size_t TmpSt;
+ bool TmpBool;
+ uint16_t WorkgrpMaxDim[3];
+ hsa_dim3_t GridMaxDim;
+ hsa_status_t Status, Status2;
+
+ Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major);
+ Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor);
+ if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS)
+ Info.add("HSA Runtime Version",
+ std::to_string(Major) + "." + std::to_string(Minor));
+
+ Info.add("HSA OpenMP Device Number", DeviceId);
+
+ Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Product Name", TmpChar);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Device Name", TmpChar);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Vendor Name", TmpChar);
+
+ hsa_device_type_t DevType;
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType);
+ if (Status == HSA_STATUS_SUCCESS) {
+ switch (DevType) {
+ case HSA_DEVICE_TYPE_CPU:
+ TmpCharPtr = "CPU";
+ break;
+ case HSA_DEVICE_TYPE_GPU:
+ TmpCharPtr = "GPU";
+ break;
+ case HSA_DEVICE_TYPE_DSP:
+ TmpCharPtr = "DSP";
+ break;
+ default:
+ TmpCharPtr = "Unknown";
+ }
+ Info.add("Device Type", TmpCharPtr);
+ }
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Max Queues", TmpUInt);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Queue Min Size", TmpUInt);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Queue Max Size", TmpUInt);
+
+ // FIXME: This is deprecated according to HSA documentation. But using
+ // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
+ // runtime.
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize);
+ if (Status == HSA_STATUS_SUCCESS) {
+ Info.add("Cache");
+
+ for (int I = 0; I < 4; I++)
+ if (CacheSize[I])
+ Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]);
+ }
+
+ Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Cacheline Size", TmpUInt);
+
+ Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Max Clock Freq", TmpUInt, "MHz");
+
+ Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Compute Units", TmpUInt);
+
+ Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("SIMD per CU", TmpUInt);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Fast F16 Operation", TmpBool);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Wavefront Size", TmpUInt2);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Workgroup Max Size", TmpUInt);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim);
+ if (Status == HSA_STATUS_SUCCESS) {
+ Info.add("Workgroup Max Size per Dimension");
+ Info.add<InfoLevel2>("x", WorkgrpMaxDim[0]);
+ Info.add<InfoLevel2>("y", WorkgrpMaxDim[1]);
+ Info.add<InfoLevel2>("z", WorkgrpMaxDim[2]);
+ }
+
+ Status = getDeviceAttrRaw(
+ (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS) {
+ Info.add("Max Waves Per CU", TmpUInt);
+ Info.add("Max Work-item Per CU", TmpUInt * TmpUInt2);
+ }
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Grid Max Size", TmpUInt);
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
+ if (Status == HSA_STATUS_SUCCESS) {
+ Info.add("Grid Max Size per Dimension");
+ Info.add<InfoLevel2>("x", GridMaxDim.x);
+ Info.add<InfoLevel2>("y", GridMaxDim.y);
+ Info.add<InfoLevel2>("z", GridMaxDim.z);
+ }
+
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Max fbarriers/Workgrp", TmpUInt);
+
+ Info.add("Memory Pools");
+ for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
+ std::string TmpStr, TmpStr2;
+
+ if (Pool->isGlobal())
+ TmpStr = "Global";
+ else if (Pool->isReadOnly())
+ TmpStr = "ReadOnly";
+ else if (Pool->isPrivate())
+ TmpStr = "Private";
+ else if (Pool->isGroup())
+ TmpStr = "Group";
+ else
+ TmpStr = "Unknown";
+
+ Info.add<InfoLevel2>(std::string("Pool ") + TmpStr);
+
+ if (Pool->isGlobal()) {
+ if (Pool->isFineGrained())
+ TmpStr2 += "Fine Grained ";
+ if (Pool->isCoarseGrained())
+ TmpStr2 += "Coarse Grained ";
+ if (Pool->supportsKernelArgs())
+ TmpStr2 += "Kernarg ";
+
+ Info.add<InfoLevel3>("Flags", TmpStr2);
+ }
+
+ Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add<InfoLevel3>("Size", TmpSt, "bytes");
+
+ Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
+ TmpBool);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add<InfoLevel3>("Allocatable", TmpBool);
+
+ Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
+ TmpSt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add<InfoLevel3>("Runtime Alloc Granule", TmpSt, "bytes");
+
+ Status = Pool->getAttrRaw(
+ HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add<InfoLevel3>("Runtime Alloc Alignment", TmpSt, "bytes");
+
+ Status =
+ Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add<InfoLevel3>("Accessable by all", TmpBool);
+ }
+
+ Info.add("ISAs");
+ auto Err = utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
+ Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add<InfoLevel2>("Name", TmpChar);
+
+ return Status;
+ });
+
+ // Silently consume the error.
+ if (Err)
+ consumeError(std::move(Err));
+
return Plugin::success();
}
@@ -2126,6 +2332,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::check(Status, "Error in hsa_agent_get_info: %s");
}
+ template <typename Ty>
+ hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) {
+ return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
+ }
+
/// Get the device agent.
hsa_agent_t getAgent() const override { return Agent; }
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 7ebc1d1092cd..8899f457ffd9 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -995,8 +995,16 @@ Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) {
}
Error GenericDeviceTy::printInfo() {
- // TODO: Print generic information here
- return printInfoImpl();
+ InfoQueueTy InfoQueue;
+
+ // Get the vendor-specific info entries describing the device properties.
+ if (auto Err = obtainInfoImpl(InfoQueue))
+ return Err;
+
+ // Print all info entries.
+ InfoQueue.print();
+
+ return Plugin::success();
}
Error GenericDeviceTy::createEvent(void **EventPtrStorage) {
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index a91ea81183c2..542d5185e919 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -13,6 +13,7 @@
#include <cstddef>
#include <cstdint>
+#include <deque>
#include <list>
#include <map>
#include <shared_mutex>
@@ -33,6 +34,7 @@
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBufferRef.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/TargetParser/Triple.h"
namespace llvm {
@@ -84,6 +86,76 @@ private:
__tgt_async_info *AsyncInfoPtr;
};
+/// The information level represents the level of a key-value property in the
+/// info tree print (i.e. indentation). The first level should be the default.
+enum InfoLevelKind { InfoLevel1 = 1, InfoLevel2, InfoLevel3 };
+
+/// Class for storing device information to be printed later. An object of
+/// this type acts as a queue of key-value properties. Each property has a
+/// key, a value, and an optional unit for the value. For printing purposes,
+/// the information can be classified into several levels. These levels are
+/// useful for defining sections and subsections. Thus, each key-value
+/// property also has an additional field indicating the level to which it
+/// belongs. Notice that we use the level to determine the indentation of the
+/// key-value property at printing time. See the enum InfoLevelKind for the
+/// list of accepted levels.
+class InfoQueueTy {
+ struct InfoQueueEntryTy {
+ std::string Key;
+ std::string Value;
+ std::string Units;
+ uint64_t Level;
+ };
+
+ std::deque<InfoQueueEntryTy> Queue;
+
+public:
+ /// Add a new info entry to the queue. The entry requires at least a key
+ /// string in \p Key. The value in \p Value is optional and can be any type
+ /// that is representable as a string. The unit in \p Units is optional and
+ /// must be a string. The info level is a template parameter that defaults to
+ /// the first level (top level).
+ template <InfoLevelKind L = InfoLevel1, typename T = std::string>
+ void add(const std::string &Key, T Value = T(),
+ const std::string &Units = std::string()) {
+ assert(!Key.empty() && "Invalid info key");
+
+ // Convert the value to a string depending on its type.
+ if constexpr (std::is_same_v<T, bool>)
+ Queue.push_back({Key, Value ? "Yes" : "No", Units, L});
+ else if constexpr (std::is_arithmetic_v<T>)
+ Queue.push_back({Key, std::to_string(Value), Units, L});
+ else
+ Queue.push_back({Key, Value, Units, L});
+ }
+
+ /// Print all info entries added to the queue.
+ void print() const {
+ // We print four spaces for each level.
+ constexpr uint64_t IndentSize = 4;
+
+ // Find the maximum key length (level + key) to compute the individual
+ // indentation of each entry.
+ uint64_t MaxKeySize = 0;
+ for (const auto &Entry : Queue) {
+ uint64_t KeySize = Entry.Key.size() + Entry.Level * IndentSize;
+ if (KeySize > MaxKeySize)
+ MaxKeySize = KeySize;
+ }
+
+ // Print all info entries.
+ for (const auto &Entry : Queue) {
+ // Compute the indentations for the current entry.
+ uint64_t KeyIndentSize = Entry.Level * IndentSize;
+ uint64_t ValIndentSize =
+ MaxKeySize - (Entry.Key.size() + KeyIndentSize) + IndentSize;
+
+ llvm::outs() << std::string(KeyIndentSize, ' ') << Entry.Key
+ << std::string(ValIndentSize, ' ') << Entry.Value
+ << (Entry.Units.empty() ? "" : " ") << Entry.Units << "\n";
+ }
+ }
+};
+
/// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device.
@@ -645,7 +717,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Print information about the device.
Error printInfo();
- virtual Error printInfoImpl() = 0;
+ virtual Error obtainInfoImpl(InfoQueueTy &Info) = 0;
/// Getters of the grid values.
uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; }
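(For context only, not part of the patch: a minimal sketch of how a plugin could use the InfoQueueTy API introduced above, written as the body of obtainInfoImpl in a hypothetical GenericDeviceTy subclass. Every key and value below is made up purely for illustration.)

  Error obtainInfoImpl(InfoQueueTy &Info) override {
    // Level-1 entries (the default level): a key, an optional value, and an
    // optional unit string.
    Info.add("Device Name", "dummy-device");
    Info.add("Global Memory Size", size_t(16) * 1024 * 1024 * 1024, "bytes");

    // Boolean values are rendered as "Yes"/"No" by InfoQueueTy::add.
    Info.add("Unified Addressing", true);

    // An entry without a value acts as a section header; InfoLevel2 entries
    // added afterwards are printed indented underneath it.
    Info.add("Maximum Block Dimensions");
    Info.add<InfoLevel2>("x", 1024);
    Info.add<InfoLevel2>("y", 1024);
    Info.add<InfoLevel2>("z", 64);

    return Plugin::success();
  }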
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index 9e38d851196c..d823cbebc3d7 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -620,147 +620,170 @@ struct CUDADeviceTy : public GenericDeviceTy {
}
/// Print information about the device.
- Error printInfoImpl() override {
+ Error obtainInfoImpl(InfoQueueTy &Info) override {
char TmpChar[1000];
- std::string TmpStr;
+ const char *TmpCharPtr;
size_t TmpSt;
- int TmpInt, TmpInt2, TmpInt3;
-
- // TODO: All these calls should be checked, but the whole printInfo must be
- // improved, so we will refactor it in the future.
- cuDriverGetVersion(&TmpInt);
- printf(" CUDA Driver Version: \t\t%d \n", TmpInt);
- printf(" CUDA Device Number: \t\t%d \n", DeviceId);
-
- cuDeviceGetName(TmpChar, 1000, Device);
- printf(" Device Name: \t\t\t%s \n", TmpChar);
-
- cuDeviceTotalMem(&TmpSt, Device);
- printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt);
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
- Device);
- printf(" Number of Multiprocessors: \t\t%d \n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device);
- printf(" Concurrent Copy and Execution: \t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
- Device);
- printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt);
-
- cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device);
- printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
- Device),
- printf(" Registers per Block: \t\t%d \n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
- printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
- Device);
- printf(" Maximum Threads per Block: \t\t%d \n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
- cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device);
- cuDeviceGetAttribute(&TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device);
- printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
- TmpInt3);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device);
- cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device);
- cuDeviceGetAttribute(&TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device);
- printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
- TmpInt3);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device);
- printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT,
- Device);
- printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device);
- printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,
- Device);
- printf(" Execution Timeout: \t\t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device);
- printf(" Integrated Device: \t\t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY,
- Device);
- printf(" Can Map Host Memory: \t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device);
- if (TmpInt == CU_COMPUTEMODE_DEFAULT)
- TmpStr = "DEFAULT";
- else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
- TmpStr = "PROHIBITED";
- else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
- TmpStr = "EXCLUSIVE PROCESS";
- else
- TmpStr = "unknown";
- printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str());
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS,
- Device);
- printf(" Concurrent Kernels: \t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device);
- printf(" ECC Enabled: \t\t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
- Device);
- printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
- Device);
- printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, Device);
- printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
-
- cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, Device);
- printf(" Max Threads Per SMP: \t\t%d \n", TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT,
- Device);
- printf(" Async Engines: \t\t\t%s (%d) \n", TmpInt ? "Yes" : "No",
- TmpInt);
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING,
- Device);
- printf(" Unified Addressing: \t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device);
- printf(" Managed Memory: \t\t\t%s \n", TmpInt ? "Yes" : "No");
+ int TmpInt;
+
+ CUresult Res = cuDriverGetVersion(&TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("CUDA Driver Version", TmpInt);
+
+ Info.add("CUDA OpenMP Device Number", DeviceId);
+
+ Res = cuDeviceGetName(TmpChar, 1000, Device);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Device Name", TmpChar);
+
+ Res = cuDeviceTotalMem(&TmpSt, Device);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Global Memory Size", TmpSt, "bytes");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Number of Multiprocessors", TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Concurrent Copy and Execution", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Total Constant Memory", TmpInt, "bytes");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
+ TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Max Shared Memory per Block", TmpInt, "bytes");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Registers per Block", TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_WARP_SIZE, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Warp Size", TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Maximum Threads per Block", TmpInt);
+
+ Info.add("Maximum Block Dimensions", "");
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add<InfoLevel2>("x", TmpInt);
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add<InfoLevel2>("y", TmpInt);
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add<InfoLevel2>("z", TmpInt);
+
+ Info.add("Maximum Grid Dimensions", "");
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add<InfoLevel2>("x", TmpInt);
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add<InfoLevel2>("y", TmpInt);
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add<InfoLevel2>("z", TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_PITCH, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Maximum Memory Pitch", TmpInt, "bytes");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Texture Alignment", TmpInt, "bytes");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Clock Rate", TmpInt, "kHz");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Execution Timeout", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_INTEGRATED, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Integrated Device", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Can Map Host Memory", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, TmpInt);
+ if (Res == CUDA_SUCCESS) {
+ if (TmpInt == CU_COMPUTEMODE_DEFAULT)
+ TmpCharPtr = "Default";
+ else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
+ TmpCharPtr = "Prohibited";
+ else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
+ TmpCharPtr = "Exclusive process";
+ else
+ TmpCharPtr = "Unknown";
+ Info.add("Compute Mode", TmpCharPtr);
+ }
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Concurrent Kernels", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ECC_ENABLED, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("ECC Enabled", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Memory Clock Rate", TmpInt, "kHz");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Memory Bus Width", TmpInt, "bits");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("L2 Cache Size", TmpInt, "bytes");
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
+ TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Max Threads Per SMP", TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Async Engines", TmpInt);
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS,
- Device);
- printf(" Concurrent Managed Memory: \t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device);
- printf(" Preemption Supported: \t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH,
- Device);
- printf(" Cooperative Launch: \t\t%s \n", TmpInt ? "Yes" : "No");
-
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device);
- printf(" Multi-Device Boars: \t\t%s \n", TmpInt ? "Yes" : "No");
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Unified Addressing", (bool)TmpInt);
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
- Device);
- cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
- Device);
- printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Managed Memory", (bool)TmpInt);
+
+ Res =
+ getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Concurrent Managed Memory", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
+ TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Preemption Supported", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Cooperative Launch", (bool)TmpInt);
+
+ Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, TmpInt);
+ if (Res == CUDA_SUCCESS)
+ Info.add("Multi-Device Boars", (bool)TmpInt);
+
+ Info.add("Compute Capabilities", ComputeCapability.str());
return Plugin::success();
}
@@ -797,6 +820,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::check(Res, "Error in cuDeviceGetAttribute: %s");
}
+ CUresult getDeviceAttrRaw(uint32_t Kind, int &Value) {
+ return cuDeviceGetAttribute(&Value, (CUdevice_attribute)Kind, Device);
+ }
+
/// See GenericDeviceTy::getComputeUnitKind().
std::string getComputeUnitKind() const override {
return ComputeCapability.str();
diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
index a9b828826b1b..a47015e2fa29 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -294,8 +294,8 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
Error syncEventImpl(void *EventPtr) override { return Plugin::success(); }
/// Print information about the device.
- Error printInfoImpl() override {
- printf(" This is a generic-elf-64bit device\n");
+ Error obtainInfoImpl(InfoQueueTy &Info) override {
+ Info.add("Device Type", "Generic-elf-64bit");
return Plugin::success();
}
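
(Also not part of the patch: given the hypothetical entries from the sketch after the PluginInterface.h hunk, InfoQueueTy::print() writes one line per entry, indenting the key by four spaces per level and aligning all values to a common column, so the output would look roughly like this:)

    Device Name                      dummy-device
    Global Memory Size               17179869184 bytes
    Unified Addressing               Yes
    Maximum Block Dimensions
        x                            1024
        y                            1024
        z                            64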