Diffstat (limited to 'ironic_python_agent/hardware.py')
-rw-r--r--  ironic_python_agent/hardware.py  120
1 file changed, 120 insertions, 0 deletions
diff --git a/ironic_python_agent/hardware.py b/ironic_python_agent/hardware.py
index 0f7e4f82..dfcce6f8 100644
--- a/ironic_python_agent/hardware.py
+++ b/ironic_python_agent/hardware.py
@@ -924,6 +924,10 @@ class HardwareManager(object, metaclass=abc.ABCMeta):
 
         :param node: Ironic node object
         :param ports: list of Ironic port objects
+        :raises: ProtectedDeviceError if a device has been identified
+            which may require manual intervention due to its contents
+            and the operational risk of destroying them, which can also
+            be a sign of an environmental misconfiguration.
         :return: a dictionary in the form {device.name: erasure output}
         """
         erase_results = {}
@@ -937,6 +941,7 @@ class HardwareManager(object, metaclass=abc.ABCMeta):
         thread_pool = ThreadPool(min(max_pool_size, len(block_devices)))
         for block_device in block_devices:
             params = {'node': node, 'block_device': block_device}
+            safety_check_block_device(node, block_device.name)
             erase_results[block_device.name] = thread_pool.apply_async(
                 dispatch_to_managers, ('erase_block_device',), params)
         thread_pool.close()
@@ -1541,6 +1546,10 @@ class GenericHardwareManager(HardwareManager):
         :param ports: list of Ironic port objects
         :raises BlockDeviceEraseError: when there's an error erasing the
                 block device
+        :raises: ProtectedDeviceError if a device has been identified
+            which may require manual intervention due to its contents
+            and the operational risk of destroying them, which can also
+            be a sign of an environmental misconfiguration.
         """
         block_devices = self.list_block_devices(include_partitions=True)
         # NOTE(coreywright): Reverse sort by device name so a partition (eg
@@ -1549,6 +1558,7 @@ class GenericHardwareManager(HardwareManager):
         block_devices.sort(key=lambda dev: dev.name, reverse=True)
         erase_errors = {}
         for dev in self._list_erasable_devices():
+            safety_check_block_device(node, dev.name)
             try:
                 disk_utils.destroy_disk_metadata(dev.name, node['uuid'])
             except processutils.ProcessExecutionError as e:
@@ -1572,6 +1582,10 @@ class GenericHardwareManager(HardwareManager):
         :param ports: list of Ironic port objects
         :raises BlockDeviceEraseError: when there's an error erasing the
                 block device
+        :raises: ProtectedDeviceError if a device has been identified
+            which may require manual intervention due to its contents
+            and the operational risk of destroying them, which can also
+            be a sign of an environmental misconfiguration.
         """
         erase_errors = {}
         info = node.get('driver_internal_info', {})
@@ -1579,6 +1593,7 @@ class GenericHardwareManager(HardwareManager):
             LOG.debug("No erasable devices have been found.")
             return
         for dev in self._list_erasable_devices():
+            safety_check_block_device(node, dev.name)
             try:
                 if self._is_nvme(dev):
                     execute_nvme_erase = info.get(
@@ -2957,3 +2972,108 @@ def get_multipath_status():
     # as if we directly try and work with the global var, we will be racing
     # tests endlessly.
     return MULTIPATH_ENABLED
+
+
+def safety_check_block_device(node, device):
+    """Perform safety checks on a block device before destroying it.
+
+    In order to guard against destruction of file systems such as
+    shared-disk file systems
+    (https://en.wikipedia.org/wiki/Clustered_file_system#SHARED-DISK)
+    or similar filesystems where multiple distinct computers may have
+    unlocked concurrent IO access to the entire block device or
+    SAN Logical Unit Number, we need to evaluate the device and block
+    cleaning from occurring on these filesystems *unless* we have been
+    explicitly configured to do so.
+
+    This is because cleaning is an intentionally destructive operation,
+    and once started against such a device, given the complexities of
+    shared disk clustered filesystems where concurrent access is a design
+    element, in all likelihood the entire cluster can be negatively
+    impacted, and an operator will be forced to recover from snapshots
+    and backups of the volume's contents.
+
+    :param node: A node, or cached node object.
+    :param device: String representing the path to the block
+                   device to be checked.
+    :raises: ProtectedDeviceError when a device is identified with
+             one of these known clustered filesystems, and the overall
+             settings have not indicated for the agent to skip such
+             safety checks.
+    """
+
+    # NOTE(TheJulia): While this seems super rare, I found out after this
+    # thread of discussion started that I have customers who have done
+    # this and wiped out SAN volumes and their contents unintentionally
+    # as a result of these filesystems not being guarded.
+    # For those not familiar with shared disk clustered filesystems, think
+    # of it as if you're impacting a Ceph cluster, except you're suddenly
+    # removing the underlying disks from the OSD, and the entire cluster
+    # goes down.
+
+    if not CONF.guard_special_filesystems:
+        return
+    di_info = node.get('driver_internal_info', {})
+    if not di_info.get('wipe_special_filesystems', True):
+        return
+    report, _e = il_utils.execute('lsblk', '-Pbia',
+                                  '-oFSTYPE,UUID,PTUUID,PARTTYPE,PARTUUID',
+                                  device)
+
+    lines = report.splitlines()
+
+    identified_fs_types = []
+    identified_ids = []
+    for line in lines:
+        # Split the lsblk output into KEY=VAL pairs
+        vals = shlex.split(line)
+        if not vals:
+            continue
+        for key, val in (v.split('=', 1) for v in vals):
+            if key == 'FSTYPE':
+                identified_fs_types.append(val)
+            if key in ['UUID', 'PTUUID', 'PARTTYPE', 'PARTUUID']:
+                identified_ids.append(val)
+        # Ignore keys which were not requested above
+
+    _check_for_special_partitions_filesystems(
+        device,
+        identified_ids,
+        identified_fs_types)
+
+
+def _check_for_special_partitions_filesystems(device, ids, fs_types):
+    """Compare supplied IDs and types to known items, and raise if found.
+
+    :param device: The block device in use, specifically for logging.
+    :param ids: A list of IDs found to check.
+    :param fs_types: A list of FS types found to check.
+    :raises: ProtectedDeviceError should a partition label or metadata
+             be discovered which suggests a shared disk clustered filesystem
+             has been discovered.
+    """
+
+    guarded_ids = {
+        # Apparently GPFS can use shared volumes....
+        '37AFFC90-EF7D-4E96-91C3-2D7AE055B174': 'IBM GPFS Partition',
+        # Shared volume parallel filesystem
+        'AA31E02A-400F-11DB-9590-000C2911D1B8': 'VMware VMFS Partition (GPT)',
+        '0xfb': 'VMware VMFS Partition (MBR)',
+    }
+    for key, value in guarded_ids.items():
+        for id_value in ids:
+            if key == id_value:
+                raise errors.ProtectedDeviceError(
+                    device=device,
+                    what=value)
+
+    guarded_fs_types = {
+        'gfs2': 'Red Hat Global File System 2',
+    }
+
+    for key, value in guarded_fs_types.items():
+        for fs in fs_types:
+            if key == fs:
+                raise errors.ProtectedDeviceError(
+                    device=device,
+                    what=value)
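For context on how the guard consumes lsblk output: the -P flag makes lsblk print one KEY="value" record per block device, which shlex.split() tokenizes cleanly. Below is a minimal, self-contained sketch of the same parsing approach; scan_line is a hypothetical helper, the SAMPLE line is invented, and the guarded tables are trimmed to a subset of the commit's entries.

    import shlex

    # Illustrative subset of the identifiers guarded by the commit above.
    GUARDED_IDS = {
        'AA31E02A-400F-11DB-9590-000C2911D1B8': 'VMware VMFS Partition (GPT)',
        '0xfb': 'VMware VMFS Partition (MBR)',
    }
    GUARDED_FS_TYPES = {'gfs2': 'Red Hat Global File System 2'}

    # Hypothetical record in the KEY="value" format produced by lsblk -P.
    SAMPLE = ('FSTYPE="" UUID="" PTUUID="" '
              'PARTTYPE="aa31e02a-400f-11db-9590-000c2911d1b8" PARTUUID="p1"')

    def scan_line(line):
        """Extract (fs_types, ids) from one lsblk -P output line."""
        fs_types, ids = [], []
        for pair in shlex.split(line):  # shlex strips the double quoting
            key, _, val = pair.partition('=')
            if not val:
                continue
            if key == 'FSTYPE':
                fs_types.append(val)
            elif key in ('UUID', 'PTUUID', 'PARTTYPE', 'PARTUUID'):
                ids.append(val)
        return fs_types, ids

    fs_types, ids = scan_line(SAMPLE)
    # Normalize case on both sides of the lookup: lsblk prints GPT type
    # GUIDs in lower case, while the guarded table keys are upper case.
    lookup = {k.upper(): v for k, v in GUARDED_IDS.items()}
    hits = [lookup[i.upper()] for i in ids if i.upper() in lookup]
    hits += [GUARDED_FS_TYPES[f] for f in fs_types if f in GUARDED_FS_TYPES]
    if hits:
        print('would refuse to erase: %s' % ', '.join(hits))

Note that the sketch normalizes case before the lookup, whereas the commit compares the reported values verbatim against its table keys.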
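Before any lsblk probing happens, the check is gated twice: globally by the agent's guard_special_filesystems option, and per node by the wipe_special_filesystems flag in driver_internal_info. A minimal sketch of just that gating logic, with a plain dict standing in for the Ironic node object and guard_is_active as a hypothetical helper:

    def guard_is_active(guard_special_filesystems, node):
        """Mirror the two early returns in safety_check_block_device."""
        if not guard_special_filesystems:
            return False  # agent-wide opt-out via configuration
        di_info = node.get('driver_internal_info', {})
        # Per-node opt-out; the flag defaults to True when unset, so an
        # unconfigured node is guarded.
        return bool(di_info.get('wipe_special_filesystems', True))

    assert guard_is_active(False, {}) is False
    assert guard_is_active(True, {}) is True
    assert guard_is_active(
        True, {'driver_internal_info': {'wipe_special_filesystems': False}}
    ) is False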