summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Mudit <mudit.simlote@dell.com> 2020-09-10 10:29:47 -0400
committer Richard Pioso <richard.pioso@dell.com> 2020-09-30 18:33:53 -0400
commit 101fc29686a4bf327521f7a4025a6c301db89e84 (patch)
tree bfb18b0d06d8ba016fb2219b108b45e0950b35a0
parent e2d0f3fd072d9a5260dc2012cc42b965c2649e82 (diff)
download ironic-101fc29686a4bf327521f7a4025a6c301db89e84.tar.gz
Add GPU reporting to idrac-wsman inspect interface
This patch implements reporting the number of NVIDIA Tesla T4 devices connected to a system by discovering such devices and reporting their count through the 'pci_gpu_devices' capability.

Change-Id: If713895f05f08a9827c4c085108abb3e388b2a2e
Story: 2008118
Task: 40839
Depends-On: https://review.opendev.org/#/c/750364/
-rw-r--r-- doc/source/admin/drivers/idrac.rst 1
-rw-r--r-- driver-requirements.txt 2
-rw-r--r-- ironic/drivers/modules/drac/inspect.py 24
-rw-r--r-- ironic/tests/unit/drivers/modules/drac/test_inspect.py 115
-rw-r--r-- releasenotes/notes/idrac-add-gpu-reporting-support-f4d80e2071f85f6a.yaml 8
5 files changed, 145 insertions, 5 deletions
diff --git a/doc/source/admin/drivers/idrac.rst b/doc/source/admin/drivers/idrac.rst
index 15a1a6671..f77eb8dc4 100644
--- a/doc/source/admin/drivers/idrac.rst
+++ b/doc/source/admin/drivers/idrac.rst
@@ -259,6 +259,7 @@ The inspection discovers the following properties:
Extra capabilities:
* ``boot_mode``: UEFI or BIOS boot mode.
+* ``pci_gpu_devices``: number of supported GPU devices (currently NVIDIA Tesla T4) connected to the bare metal node.
It also creates baremetal ports for each NIC port detected in the system.
The ``idrac-wsman`` inspect interface discovers which NIC ports are
diff --git a/driver-requirements.txt b/driver-requirements.txt
index ec736ad6c..b00680fa2 100644
--- a/driver-requirements.txt
+++ b/driver-requirements.txt
@@ -7,7 +7,7 @@
proliantutils>=2.10.0
pysnmp>=4.3.0,<5.0.0
python-scciclient>=0.8.0
-python-dracclient>=3.1.0,<6.0.0
+python-dracclient>=5.1.0,<6.0.0
python-xclarityclient>=0.1.6
# The Redfish hardware type uses the Sushy library
diff --git a/ironic/drivers/modules/drac/inspect.py b/ironic/drivers/modules/drac/inspect.py
index 620a32273..77e48226f 100644
--- a/ironic/drivers/modules/drac/inspect.py
+++ b/ironic/drivers/modules/drac/inspect.py
@@ -49,6 +49,8 @@ class DracRedfishInspect(redfish_inspect.RedfishInspect):
class DracWSManInspect(base.InspectInterface):
+ _GPU_SUPPORTED_LIST = {"TU104GL [Tesla T4]"}
+
def get_properties(self):
"""Return the properties of the interface.
@@ -98,9 +100,12 @@ class DracWSManInspect(base.InspectInterface):
properties['cpu_arch'] = 'x86_64' if cpus[0].arch64 else 'x86'
bios_settings = client.list_bios_settings()
+ video_controllers = client.list_video_controllers()
current_capabilities = node.properties.get('capabilities', '')
new_capabilities = {
- 'boot_mode': bios_settings["BootMode"].current_value.lower()}
+ 'boot_mode': bios_settings["BootMode"].current_value.lower(),
+ 'pci_gpu_devices': self._calculate_gpus(video_controllers)}
+
capabilties = utils.get_updated_capabilities(current_capabilities,
new_capabilities)
properties['capabilities'] = capabilties
@@ -190,6 +195,23 @@ class DracWSManInspect(base.InspectInterface):
else:
return cpu.cores
+ def _calculate_gpus(self, video_controllers):
+ """Find actual GPU count.
+
+ This method reports number of NVIDIA Tesla T4 GPU devices present
+ on the server.
+
+ :param video_controllers: list of video controllers.
+
+ :returns: returns total gpu count.
+ """
+ gpu_cnt = 0
+ for video_controller in video_controllers:
+ for gpu in self._GPU_SUPPORTED_LIST:
+ if video_controller.description == gpu:
+ gpu_cnt += 1
+ return gpu_cnt
+
def _get_pxe_dev_nics(self, client, nics, node):
"""Get a list of pxe device interfaces.
diff --git a/ironic/tests/unit/drivers/modules/drac/test_inspect.py b/ironic/tests/unit/drivers/modules/drac/test_inspect.py
index 628f3c855..ecb9346f2 100644
--- a/ironic/tests/unit/drivers/modules/drac/test_inspect.py
+++ b/ironic/tests/unit/drivers/modules/drac/test_inspect.py
@@ -135,6 +135,23 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
'PxeDev4Interface': None}
nic_settings = {'LegacyBootProto': {'current_value': 'PXE'},
'FQDD': 'NIC.Embedded.1-1-1'}
+ video_controllers = [
+ {'id': 'Video.Embedded.1-1',
+ 'description': 'Integrated Matrox G200eW3 Graphics Controller',
+ 'function_number': 0,
+ 'manufacturer': 'Matrox Electronics Systems Ltd.',
+ 'pci_device_id': '0536',
+ 'pci_vendor_id': '102B',
+ 'pci_subdevice_id': '0737',
+ 'pci_subvendor_id': '1028'},
+ {'id': 'Video.Slot.7-1',
+ 'description': 'TU104GL [Tesla T4]',
+ 'function_number': 0,
+ 'manufacturer': 'NVIDIA Corporation',
+ 'pci_device_id': '1EB8',
+ 'pci_vendor_id': '10DE',
+ 'pci_subdevice_id': '12A2',
+ 'pci_subvendor_id': '10DE'}]
self.memory = [test_utils.dict_to_namedtuple(values=m) for m in memory]
self.cpus = [test_utils.dict_to_namedtuple(values=c) for c in cpus]
@@ -146,6 +163,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
self.bios_boot_settings = test_utils.dict_of_object(bios_boot_settings)
self.uefi_boot_settings = test_utils.dict_of_object(uefi_boot_settings)
self.nic_settings = test_utils.dict_of_object(nic_settings)
+ self.video_controllers = [test_utils.dict_to_namedtuple(values=vc)
+ for vc in video_controllers]
def test_get_properties(self):
expected = drac_common.COMMON_PROPERTIES
@@ -161,7 +180,7 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
'local_gb': 1116,
'cpus': 18,
'cpu_arch': 'x86_64',
- 'capabilities': 'boot_mode:uefi'}
+ 'capabilities': 'boot_mode:uefi,pci_gpu_devices:1'}
mock_client = mock.Mock()
mock_get_drac_client.return_value = mock_client
mock_client.list_memory.return_value = self.memory
@@ -169,6 +188,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
mock_client.list_virtual_disks.return_value = self.virtual_disks
mock_client.list_nics.return_value = self.nics
mock_client.list_bios_settings.return_value = self.uefi_boot_settings
+ mock_client.list_video_controllers.return_value = \
+ self.video_controllers
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
@@ -191,6 +212,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
mock_client.list_virtual_disks.side_effect = (
drac_exceptions.BaseClientException('boom'))
mock_client.list_bios_settings.return_value = self.bios_boot_settings
+ mock_client.list_video_controllers.return_value = \
+ self.video_controllers
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
@@ -207,7 +230,7 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
'local_gb': 279,
'cpus': 18,
'cpu_arch': 'x86_64',
- 'capabilities': 'boot_mode:uefi'}
+ 'capabilities': 'boot_mode:uefi,pci_gpu_devices:1'}
mock_client = mock.Mock()
mock_get_drac_client.return_value = mock_client
mock_client.list_memory.return_value = self.memory
@@ -216,6 +239,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
mock_client.list_physical_disks.return_value = self.physical_disks
mock_client.list_nics.return_value = self.nics
mock_client.list_bios_settings.return_value = self.uefi_boot_settings
+ mock_client.list_video_controllers.return_value = \
+ self.video_controllers
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
@@ -239,6 +264,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
mock_client.list_physical_disks.return_value = self.physical_disks
mock_client.list_nics.return_value = self.nics
mock_client.list_bios_settings.return_value = self.uefi_boot_settings
+ mock_client.list_video_controllers.return_value = \
+ self.video_controllers
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
@@ -248,6 +275,86 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
@mock.patch.object(drac_common, 'get_drac_client', spec_set=True,
autospec=True)
@mock.patch.object(objects.Port, 'create', spec_set=True, autospec=True)
+ def test_inspect_hardware_no_supported_gpu(self, mock_port_create,
+ mock_get_drac_client):
+ controllers = [
+ {'id': 'Video.Embedded.1-1',
+ 'description': 'Integrated Matrox G200eW3 Graphics Controller',
+ 'function_number': 0,
+ 'manufacturer': 'Matrox Electronics Systems Ltd.',
+ 'pci_device_id': '0536',
+ 'pci_vendor_id': '102B',
+ 'pci_subdevice_id': '0737',
+ 'pci_subvendor_id': '1028'},
+ {'id': 'Video.Slot.7-1',
+ 'description': 'GV100GL [Tesla V100 PCIe 16GB]]',
+ 'function_number': 0,
+ 'manufacturer': 'NVIDIA Corporation',
+ 'pci_device_id': '1DB4',
+ 'pci_vendor_id': '10DE',
+ 'pci_subdevice_id': '1214',
+ 'pci_subvendor_id': '10DE'}]
+
+ expected_node_properties = {
+ 'memory_mb': 32768,
+ 'local_gb': 279,
+ 'cpus': 18,
+ 'cpu_arch': 'x86_64',
+ 'capabilities': 'boot_mode:uefi,pci_gpu_devices:0'}
+ mock_client = mock.Mock()
+ mock_get_drac_client.return_value = mock_client
+ mock_client.list_memory.return_value = self.memory
+ mock_client.list_cpus.return_value = self.cpus
+ mock_client.list_virtual_disks.return_value = []
+ mock_client.list_physical_disks.return_value = self.physical_disks
+ mock_client.list_nics.return_value = self.nics
+ mock_client.list_bios_settings.return_value = self.uefi_boot_settings
+ video_controllers = [test_utils.dict_to_namedtuple(values=vc)
+ for vc in controllers]
+ mock_client.list_video_controllers.return_value = video_controllers
+
+ with task_manager.acquire(self.context, self.node.uuid,
+ shared=True) as task:
+ return_value = task.driver.inspect.inspect_hardware(task)
+
+ self.node.refresh()
+ self.assertEqual(expected_node_properties, self.node.properties)
+ self.assertEqual(states.MANAGEABLE, return_value)
+ self.assertEqual(2, mock_port_create.call_count)
+
+ @mock.patch.object(drac_common, 'get_drac_client', spec_set=True,
+ autospec=True)
+ @mock.patch.object(objects.Port, 'create', spec_set=True, autospec=True)
+ def test_inspect_hardware_no_gpu(self, mock_port_create,
+ mock_get_drac_client):
+ expected_node_properties = {
+ 'memory_mb': 32768,
+ 'local_gb': 279,
+ 'cpus': 18,
+ 'cpu_arch': 'x86_64',
+ 'capabilities': 'boot_mode:uefi,pci_gpu_devices:0'}
+ mock_client = mock.Mock()
+ mock_get_drac_client.return_value = mock_client
+ mock_client.list_memory.return_value = self.memory
+ mock_client.list_cpus.return_value = self.cpus
+ mock_client.list_virtual_disks.return_value = []
+ mock_client.list_physical_disks.return_value = self.physical_disks
+ mock_client.list_nics.return_value = self.nics
+ mock_client.list_bios_settings.return_value = self.uefi_boot_settings
+ mock_client.list_video_controllers.return_value = []
+
+ with task_manager.acquire(self.context, self.node.uuid,
+ shared=True) as task:
+ return_value = task.driver.inspect.inspect_hardware(task)
+
+ self.node.refresh()
+ self.assertEqual(expected_node_properties, self.node.properties)
+ self.assertEqual(states.MANAGEABLE, return_value)
+ self.assertEqual(2, mock_port_create.call_count)
+
+ @mock.patch.object(drac_common, 'get_drac_client', spec_set=True,
+ autospec=True)
+ @mock.patch.object(objects.Port, 'create', spec_set=True, autospec=True)
def test_inspect_hardware_with_existing_ports(self, mock_port_create,
mock_get_drac_client):
expected_node_properties = {
@@ -255,7 +362,7 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
'local_gb': 1116,
'cpus': 18,
'cpu_arch': 'x86_64',
- 'capabilities': 'boot_mode:uefi'}
+ 'capabilities': 'boot_mode:uefi,pci_gpu_devices:1'}
mock_client = mock.Mock()
mock_get_drac_client.return_value = mock_client
mock_client.list_memory.return_value = self.memory
@@ -263,6 +370,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
mock_client.list_virtual_disks.return_value = self.virtual_disks
mock_client.list_nics.return_value = self.nics
mock_client.list_bios_settings.return_value = self.uefi_boot_settings
+ mock_client.list_video_controllers.return_value = \
+ self.video_controllers
mock_port_create.side_effect = exception.MACAlreadyExists("boom")
diff --git a/releasenotes/notes/idrac-add-gpu-reporting-support-f4d80e2071f85f6a.yaml b/releasenotes/notes/idrac-add-gpu-reporting-support-f4d80e2071f85f6a.yaml
new file mode 100644
index 000000000..fb4a84148
--- /dev/null
+++ b/releasenotes/notes/idrac-add-gpu-reporting-support-f4d80e2071f85f6a.yaml
@@ -0,0 +1,8 @@
+---
+features:
+ - |
+ Adds support to the ``idrac-wsman`` inspect interface for reporting the
+ number of GPU devices connected to a system. This count is advertised
+ through the ``pci_gpu_devices`` capability, which can be used to make
+ scheduling decisions for the node. Currently, only NVIDIA Tesla T4 GPU
+ devices are counted.