diff options
Diffstat (limited to 'swift/common/utils')
-rw-r--r-- | swift/common/utils/__init__.py | 1013 | ||||
-rw-r--r-- | swift/common/utils/libc.py | 487 | ||||
-rw-r--r-- | swift/common/utils/timestamp.py | 399 |
3 files changed, 1009 insertions, 890 deletions
diff --git a/swift/common/utils/__init__.py b/swift/common/utils/__init__.py index 3b4db177e..ef6b0180e 100644 --- a/swift/common/utils/__init__.py +++ b/swift/common/utils/__init__.py @@ -26,7 +26,6 @@ import fcntl import grp import hashlib import json -import math import operator import os import pwd @@ -37,12 +36,9 @@ import sys import time import uuid import functools -import platform import email.parser from random import random, shuffle from contextlib import contextmanager, closing -import ctypes -import ctypes.util from optparse import OptionParser import traceback import warnings @@ -97,90 +93,36 @@ from swift.common.linkat import linkat # For backwards compatability with 3rd party middlewares from swift.common.registry import register_swift_info, get_swift_info # noqa +from swift.common.utils.libc import ( # noqa + F_SETPIPE_SZ, + load_libc_function, + config_fallocate_value, + disable_fallocate, + fallocate, + punch_hole, + drop_buffer_cache, + get_md5_socket, + modify_priority, +) +from swift.common.utils.timestamp import ( # noqa + NORMAL_FORMAT, + INTERNAL_FORMAT, + SHORT_FORMAT, + MAX_OFFSET, + PRECISION, + Timestamp, + encode_timestamps, + decode_timestamps, + normalize_timestamp, + EPOCH, + last_modified_date_to_timestamp, + normalize_delete_at_timestamp, +) -# logging doesn't import patched as cleanly as one would like from logging.handlers import SysLogHandler import logging -logging.thread = eventlet.green.thread -logging.threading = eventlet.green.threading -logging._lock = logging.threading.RLock() -# setup notice level logging -NOTICE = 25 -logging.addLevelName(NOTICE, 'NOTICE') -SysLogHandler.priority_map['NOTICE'] = 'notice' - -# These are lazily pulled from libc elsewhere -_sys_fallocate = None -_posix_fadvise = None -_libc_socket = None -_libc_bind = None -_libc_accept = None -# see man -s 2 setpriority -_libc_setpriority = None -# see man -s 2 syscall -_posix_syscall = None - -# If set to non-zero, fallocate routines will fail based on 
free space -# available being at or below this amount, in bytes. -FALLOCATE_RESERVE = 0 -# Indicates if FALLOCATE_RESERVE is the percentage of free space (True) or -# the number of bytes (False). -FALLOCATE_IS_PERCENT = False - -# from /usr/include/linux/falloc.h -FALLOC_FL_KEEP_SIZE = 1 -FALLOC_FL_PUNCH_HOLE = 2 - -# from /usr/src/linux-headers-*/include/uapi/linux/resource.h -PRIO_PROCESS = 0 - - -# /usr/include/x86_64-linux-gnu/asm/unistd_64.h defines syscalls there -# are many like it, but this one is mine, see man -s 2 ioprio_set -def NR_ioprio_set(): - """Give __NR_ioprio_set value for your system.""" - architecture = os.uname()[4] - arch_bits = platform.architecture()[0] - # check if supported system, now support x86_64 and AArch64 - if architecture == 'x86_64' and arch_bits == '64bit': - return 251 - elif architecture == 'aarch64' and arch_bits == '64bit': - return 30 - raise OSError("Swift doesn't support ionice priority for %s %s" % - (architecture, arch_bits)) - - -# this syscall integer probably only works on x86_64 linux systems, you -# can check if it's correct on yours with something like this: -""" -#include <stdio.h> -#include <sys/syscall.h> - -int main(int argc, const char* argv[]) { - printf("%d\n", __NR_ioprio_set); - return 0; -} -""" - -# this is the value for "which" that says our who value will be a pid -# pulled out of /usr/src/linux-headers-*/include/linux/ioprio.h -IOPRIO_WHO_PROCESS = 1 - - -IO_CLASS_ENUM = { - 'IOPRIO_CLASS_RT': 1, - 'IOPRIO_CLASS_BE': 2, - 'IOPRIO_CLASS_IDLE': 3, -} - -# the IOPRIO_PRIO_VALUE "macro" is also pulled from -# /usr/src/linux-headers-*/include/linux/ioprio.h -IOPRIO_CLASS_SHIFT = 13 - - -def IOPRIO_PRIO_VALUE(class_, data): - return (((class_) << IOPRIO_CLASS_SHIFT) | data) +NOTICE = 25 # Used by hash_path to offer a bit more security when generating hashes for # paths. 
It simply appends this value to all paths; guessing the hash a path @@ -190,12 +132,6 @@ HASH_PATH_PREFIX = b'' SWIFT_CONF_FILE = '/etc/swift/swift.conf' -# These constants are Linux-specific, and Python doesn't seem to know -# about them. We ask anyway just in case that ever gets fixed. -# -# The values were copied from the Linux 3.x kernel headers. -AF_ALG = getattr(socket, 'AF_ALG', 38) -F_SETPIPE_SZ = getattr(fcntl, 'F_SETPIPE_SZ', 1031) O_TMPFILE = getattr(os, 'O_TMPFILE', 0o20000000 | os.O_DIRECTORY) # Used by the parse_socket_string() function to validate IPv6 addresses @@ -500,6 +436,17 @@ def config_read_prefixed_options(conf, prefix_name, defaults): return params +def logging_monkey_patch(): + # explicitly patch the logging lock + logging._lock = logging.threading.RLock() + # setup notice level logging + logging.addLevelName(NOTICE, 'NOTICE') + SysLogHandler.priority_map['NOTICE'] = 'notice' + # Trying to log threads while monkey-patched can lead to deadlocks; see + # https://bugs.launchpad.net/swift/+bug/1895739 + logging.logThreads = 0 + + def eventlet_monkey_patch(): """ Install the appropriate Eventlet monkey patches. @@ -510,13 +457,14 @@ def eventlet_monkey_patch(): # if thread is monkey-patched. eventlet.patcher.monkey_patch(all=False, socket=True, select=True, thread=True) - # Trying to log threads while monkey-patched can lead to deadlocks; see - # https://bugs.launchpad.net/swift/+bug/1895739 - logging.logThreads = 0 -def noop_libc_function(*args): - return 0 +def monkey_patch(): + """ + Apply all swift monkey patching consistently in one place. + """ + eventlet_monkey_patch() + logging_monkey_patch() def validate_configuration(): @@ -526,39 +474,6 @@ def validate_configuration(): sys.exit("Error: %s" % e) -def load_libc_function(func_name, log_error=True, - fail_if_missing=False, errcheck=False): - """ - Attempt to find the function in libc, otherwise return a no-op func. - - :param func_name: name of the function to pull from libc. 
- :param log_error: log an error when a function can't be found - :param fail_if_missing: raise an exception when a function can't be found. - Default behavior is to return a no-op function. - :param errcheck: boolean, if true install a wrapper on the function - to check for a return values of -1 and call - ctype.get_errno and raise an OSError - """ - try: - libc = ctypes.CDLL(ctypes.util.find_library('c'), use_errno=True) - func = getattr(libc, func_name) - except AttributeError: - if fail_if_missing: - raise - if log_error: - logging.warning(_("Unable to locate %s in libc. Leaving as a " - "no-op."), func_name) - return noop_libc_function - if errcheck: - def _errcheck(result, f, args): - if result == -1: - errcode = ctypes.get_errno() - raise OSError(errcode, os.strerror(errcode)) - return result - func.errcheck = _errcheck - return func - - def generate_trans_id(trans_id_suffix): return 'tx%s-%010x%s' % ( uuid.uuid4().hex[:21], int(time.time()), quote(trans_id_suffix)) @@ -755,25 +670,6 @@ def get_trans_id_time(trans_id): return None -def config_fallocate_value(reserve_value): - """ - Returns fallocate reserve_value as an int or float. - Returns is_percent as a boolean. - Returns a ValueError on invalid fallocate value. - """ - try: - if str(reserve_value[-1:]) == '%': - reserve_value = float(reserve_value[:-1]) - is_percent = True - else: - reserve_value = int(reserve_value) - is_percent = False - except ValueError: - raise ValueError('Error: %s is an invalid value for fallocate' - '_reserve.' % reserve_value) - return reserve_value, is_percent - - class FileLikeIter(object): def __init__(self, iterable): @@ -924,164 +820,6 @@ def fs_has_free_space(fs_path, space_needed, is_percent): return free_bytes >= space_needed -class _LibcWrapper(object): - """ - A callable object that forwards its calls to a C function from libc. - - These objects are lazy. libc will not be checked until someone tries to - either call the function or check its availability. 
- - _LibcWrapper objects have an "available" property; if true, then libc - has the function of that name. If false, then calls will fail with a - NotImplementedError. - """ - - def __init__(self, func_name): - self._func_name = func_name - self._func_handle = None - self._loaded = False - - def _ensure_loaded(self): - if not self._loaded: - func_name = self._func_name - try: - # Keep everything in this try-block in local variables so - # that a typo in self.some_attribute_name doesn't raise a - # spurious AttributeError. - func_handle = load_libc_function( - func_name, fail_if_missing=True) - self._func_handle = func_handle - except AttributeError: - # We pass fail_if_missing=True to load_libc_function and - # then ignore the error. It's weird, but otherwise we have - # to check if self._func_handle is noop_libc_function, and - # that's even weirder. - pass - self._loaded = True - - @property - def available(self): - self._ensure_loaded() - return bool(self._func_handle) - - def __call__(self, *args): - if self.available: - return self._func_handle(*args) - else: - raise NotImplementedError( - "No function %r found in libc" % self._func_name) - - -_fallocate_enabled = True -_fallocate_warned_about_missing = False -_sys_fallocate = _LibcWrapper('fallocate') -_sys_posix_fallocate = _LibcWrapper('posix_fallocate') - - -def disable_fallocate(): - global _fallocate_enabled - _fallocate_enabled = False - - -def fallocate(fd, size, offset=0): - """ - Pre-allocate disk space for a file. - - This function can be disabled by calling disable_fallocate(). If no - suitable C function is available in libc, this function is a no-op. 
- - :param fd: file descriptor - :param size: size to allocate (in bytes) - """ - global _fallocate_enabled - if not _fallocate_enabled: - return - - if size < 0: - size = 0 # Done historically; not really sure why - if size >= (1 << 63): - raise ValueError('size must be less than 2 ** 63') - if offset < 0: - raise ValueError('offset must be non-negative') - if offset >= (1 << 63): - raise ValueError('offset must be less than 2 ** 63') - - # Make sure there's some (configurable) amount of free space in - # addition to the number of bytes we're allocating. - if FALLOCATE_RESERVE: - st = os.fstatvfs(fd) - free = st.f_frsize * st.f_bavail - size - if FALLOCATE_IS_PERCENT: - free = (float(free) / float(st.f_frsize * st.f_blocks)) * 100 - if float(free) <= float(FALLOCATE_RESERVE): - raise OSError( - errno.ENOSPC, - 'FALLOCATE_RESERVE fail %g <= %g' % - (free, FALLOCATE_RESERVE)) - - if _sys_fallocate.available: - # Parameters are (fd, mode, offset, length). - # - # mode=FALLOC_FL_KEEP_SIZE pre-allocates invisibly (without - # affecting the reported file size). - ret = _sys_fallocate( - fd, FALLOC_FL_KEEP_SIZE, ctypes.c_uint64(offset), - ctypes.c_uint64(size)) - err = ctypes.get_errno() - elif _sys_posix_fallocate.available: - # Parameters are (fd, offset, length). - ret = _sys_posix_fallocate(fd, ctypes.c_uint64(offset), - ctypes.c_uint64(size)) - err = ctypes.get_errno() - else: - # No suitable fallocate-like function is in our libc. Warn about it, - # but just once per process, and then do nothing. - global _fallocate_warned_about_missing - if not _fallocate_warned_about_missing: - logging.warning(_("Unable to locate fallocate, posix_fallocate in " - "libc. Leaving as a no-op.")) - _fallocate_warned_about_missing = True - return - - if ret and err not in (0, errno.ENOSYS, errno.EOPNOTSUPP, - errno.EINVAL): - raise OSError(err, 'Unable to fallocate(%s)' % size) - - -def punch_hole(fd, offset, length): - """ - De-allocate disk space in the middle of a file. 
- - :param fd: file descriptor - :param offset: index of first byte to de-allocate - :param length: number of bytes to de-allocate - """ - if offset < 0: - raise ValueError('offset must be non-negative') - if offset >= (1 << 63): - raise ValueError('offset must be less than 2 ** 63') - if length <= 0: - raise ValueError('length must be positive') - if length >= (1 << 63): - raise ValueError('length must be less than 2 ** 63') - - if _sys_fallocate.available: - # Parameters are (fd, mode, offset, length). - ret = _sys_fallocate( - fd, - FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, - ctypes.c_uint64(offset), - ctypes.c_uint64(length)) - err = ctypes.get_errno() - if ret and err: - mode_str = "FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE" - raise OSError(err, "Unable to fallocate(%d, %s, %d, %d)" % ( - fd, mode_str, offset, length)) - else: - raise OSError(errno.ENOTSUP, - 'No suitable C function found for hole punching') - - def fsync(fd): """ Sync modified file data and metadata to disk. @@ -1131,402 +869,6 @@ def fsync_dir(dirpath): os.close(dirfd) -def drop_buffer_cache(fd, offset, length): - """ - Drop 'buffer' cache for the given range of the given file. - - :param fd: file descriptor - :param offset: start offset - :param length: length - """ - global _posix_fadvise - if _posix_fadvise is None: - _posix_fadvise = load_libc_function('posix_fadvise64') - # 4 means "POSIX_FADV_DONTNEED" - ret = _posix_fadvise(fd, ctypes.c_uint64(offset), - ctypes.c_uint64(length), 4) - if ret != 0: - logging.warning("posix_fadvise64(%(fd)s, %(offset)s, %(length)s, 4) " - "-> %(ret)s", {'fd': fd, 'offset': offset, - 'length': length, 'ret': ret}) - - -NORMAL_FORMAT = "%016.05f" -INTERNAL_FORMAT = NORMAL_FORMAT + '_%016x' -SHORT_FORMAT = NORMAL_FORMAT + '_%x' -MAX_OFFSET = (16 ** 16) - 1 -PRECISION = 1e-5 -# Setting this to True will cause the internal format to always display -# extended digits - even when the value is equivalent to the normalized form. 
-# This isn't ideal during an upgrade when some servers might not understand -# the new time format - but flipping it to True works great for testing. -FORCE_INTERNAL = False # or True - - -@functools.total_ordering -class Timestamp(object): - """ - Internal Representation of Swift Time. - - The normalized form of the X-Timestamp header looks like a float - with a fixed width to ensure stable string sorting - normalized - timestamps look like "1402464677.04188" - - To support overwrites of existing data without modifying the original - timestamp but still maintain consistency a second internal offset vector - is append to the normalized timestamp form which compares and sorts - greater than the fixed width float format but less than a newer timestamp. - The internalized format of timestamps looks like - "1402464677.04188_0000000000000000" - the portion after the underscore is - the offset and is a formatted hexadecimal integer. - - The internalized form is not exposed to clients in responses from - Swift. Normal client operations will not create a timestamp with an - offset. - - The Timestamp class in common.utils supports internalized and - normalized formatting of timestamps and also comparison of timestamp - values. When the offset value of a Timestamp is 0 - it's considered - insignificant and need not be represented in the string format; to - support backwards compatibility during a Swift upgrade the - internalized and normalized form of a Timestamp with an - insignificant offset are identical. When a timestamp includes an - offset it will always be represented in the internalized form, but - is still excluded from the normalized form. Timestamps with an - equivalent timestamp portion (the float part) will compare and order - by their offset. Timestamps with a greater timestamp portion will - always compare and order greater than a Timestamp with a lesser - timestamp regardless of it's offset. 
String comparison and ordering - is guaranteed for the internalized string format, and is backwards - compatible for normalized timestamps which do not include an offset. - """ - - def __init__(self, timestamp, offset=0, delta=0, check_bounds=True): - """ - Create a new Timestamp. - - :param timestamp: time in seconds since the Epoch, may be any of: - - * a float or integer - * normalized/internalized string - * another instance of this class (offset is preserved) - - :param offset: the second internal offset vector, an int - :param delta: deca-microsecond difference from the base timestamp - param, an int - """ - if isinstance(timestamp, bytes): - timestamp = timestamp.decode('ascii') - if isinstance(timestamp, six.string_types): - base, base_offset = timestamp.partition('_')[::2] - self.timestamp = float(base) - if '_' in base_offset: - raise ValueError('invalid literal for int() with base 16: ' - '%r' % base_offset) - if base_offset: - self.offset = int(base_offset, 16) - else: - self.offset = 0 - else: - self.timestamp = float(timestamp) - self.offset = getattr(timestamp, 'offset', 0) - # increment offset - if offset >= 0: - self.offset += offset - else: - raise ValueError('offset must be non-negative') - if self.offset > MAX_OFFSET: - raise ValueError('offset must be smaller than %d' % MAX_OFFSET) - self.raw = int(round(self.timestamp / PRECISION)) - # add delta - if delta: - self.raw = self.raw + delta - if self.raw <= 0: - raise ValueError( - 'delta must be greater than %d' % (-1 * self.raw)) - self.timestamp = float(self.raw * PRECISION) - if check_bounds: - if self.timestamp < 0: - raise ValueError('timestamp cannot be negative') - if self.timestamp >= 10000000000: - raise ValueError('timestamp too large') - - @classmethod - def now(cls, offset=0, delta=0): - return cls(time.time(), offset=offset, delta=delta) - - def __repr__(self): - return INTERNAL_FORMAT % (self.timestamp, self.offset) - - def __str__(self): - raise TypeError('You must specify which 
string format is required') - - def __float__(self): - return self.timestamp - - def __int__(self): - return int(self.timestamp) - - def __nonzero__(self): - return bool(self.timestamp or self.offset) - - def __bool__(self): - return self.__nonzero__() - - @property - def normal(self): - return NORMAL_FORMAT % self.timestamp - - @property - def internal(self): - if self.offset or FORCE_INTERNAL: - return INTERNAL_FORMAT % (self.timestamp, self.offset) - else: - return self.normal - - @property - def short(self): - if self.offset or FORCE_INTERNAL: - return SHORT_FORMAT % (self.timestamp, self.offset) - else: - return self.normal - - @property - def isoformat(self): - """ - Get an isoformat string representation of the 'normal' part of the - Timestamp with microsecond precision and no trailing timezone, for - example:: - - 1970-01-01T00:00:00.000000 - - :return: an isoformat string - """ - t = float(self.normal) - if six.PY3: - # On Python 3, round manually using ROUND_HALF_EVEN rounding - # method, to use the same rounding method than Python 2. Python 3 - # used a different rounding method, but Python 3.4.4 and 3.5.1 use - # again ROUND_HALF_EVEN as Python 2. - # See https://bugs.python.org/issue23517 - frac, t = math.modf(t) - us = round(frac * 1e6) - if us >= 1000000: - t += 1 - us -= 1000000 - elif us < 0: - t -= 1 - us += 1000000 - dt = datetime.datetime.utcfromtimestamp(t) - dt = dt.replace(microsecond=us) - else: - dt = datetime.datetime.utcfromtimestamp(t) - - isoformat = dt.isoformat() - # python isoformat() doesn't include msecs when zero - if len(isoformat) < len("1970-01-01T00:00:00.000000"): - isoformat += ".000000" - return isoformat - - @classmethod - def from_isoformat(cls, date_string): - """ - Parse an isoformat string representation of time to a Timestamp object. - - :param date_string: a string formatted as per an Timestamp.isoformat - property. - :return: an instance of this class. 
- """ - start = datetime.datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S.%f") - delta = start - EPOCH - # This calculation is based on Python 2.7's Modules/datetimemodule.c, - # function delta_to_microseconds(), but written in Python. - return cls(delta.total_seconds()) - - def ceil(self): - """ - Return the 'normal' part of the timestamp rounded up to the nearest - integer number of seconds. - - This value should be used whenever the second-precision Last-Modified - time of a resource is required. - - :return: a float value with second precision. - """ - return math.ceil(float(self)) - - def __eq__(self, other): - if other is None: - return False - if not isinstance(other, Timestamp): - try: - other = Timestamp(other, check_bounds=False) - except ValueError: - return False - return self.internal == other.internal - - def __ne__(self, other): - return not (self == other) - - def __lt__(self, other): - if other is None: - return False - if not isinstance(other, Timestamp): - other = Timestamp(other, check_bounds=False) - if other.timestamp < 0: - return False - if other.timestamp >= 10000000000: - return True - return self.internal < other.internal - - def __hash__(self): - return hash(self.internal) - - def __invert__(self): - if self.offset: - raise ValueError('Cannot invert timestamps with offsets') - return Timestamp((999999999999999 - self.raw) * PRECISION) - - -def encode_timestamps(t1, t2=None, t3=None, explicit=False): - """ - Encode up to three timestamps into a string. Unlike a Timestamp object, the - encoded string does NOT used fixed width fields and consequently no - relative chronology of the timestamps can be inferred from lexicographic - sorting of encoded timestamp strings. - - The format of the encoded string is: - <t1>[<+/-><t2 - t1>[<+/-><t3 - t2>]] - - i.e. if t1 = t2 = t3 then just the string representation of t1 is returned, - otherwise the time offsets for t2 and t3 are appended. 
If explicit is True - then the offsets for t2 and t3 are always appended even if zero. - - Note: any offset value in t1 will be preserved, but offsets on t2 and t3 - are not preserved. In the anticipated use cases for this method (and the - inverse decode_timestamps method) the timestamps passed as t2 and t3 are - not expected to have offsets as they will be timestamps associated with a - POST request. In the case where the encoding is used in a container objects - table row, t1 could be the PUT or DELETE time but t2 and t3 represent the - content type and metadata times (if different from the data file) i.e. - correspond to POST timestamps. In the case where the encoded form is used - in a .meta file name, t1 and t2 both correspond to POST timestamps. - """ - form = '{0}' - values = [t1.short] - if t2 is not None: - t2_t1_delta = t2.raw - t1.raw - explicit = explicit or (t2_t1_delta != 0) - values.append(t2_t1_delta) - if t3 is not None: - t3_t2_delta = t3.raw - t2.raw - explicit = explicit or (t3_t2_delta != 0) - values.append(t3_t2_delta) - if explicit: - form += '{1:+x}' - if t3 is not None: - form += '{2:+x}' - return form.format(*values) - - -def decode_timestamps(encoded, explicit=False): - """ - Parses a string of the form generated by encode_timestamps and returns - a tuple of the three component timestamps. If explicit is False, component - timestamps that are not explicitly encoded will be assumed to have zero - delta from the previous component and therefore take the value of the - previous component. If explicit is True, component timestamps that are - not explicitly encoded will be returned with value None. - """ - # TODO: some tests, e.g. in test_replicator, put float timestamps values - # into container db's, hence this defensive check, but in real world - # this may never happen. 
- if not isinstance(encoded, six.string_types): - ts = Timestamp(encoded) - return ts, ts, ts - - parts = [] - signs = [] - pos_parts = encoded.split('+') - for part in pos_parts: - # parse time components and their signs - # e.g. x-y+z --> parts = [x, y, z] and signs = [+1, -1, +1] - neg_parts = part.split('-') - parts = parts + neg_parts - signs = signs + [1] + [-1] * (len(neg_parts) - 1) - t1 = Timestamp(parts[0]) - t2 = t3 = None - if len(parts) > 1: - t2 = t1 - delta = signs[1] * int(parts[1], 16) - # if delta = 0 we want t2 = t3 = t1 in order to - # preserve any offset in t1 - only construct a distinct - # timestamp if there is a non-zero delta. - if delta: - t2 = Timestamp((t1.raw + delta) * PRECISION) - elif not explicit: - t2 = t1 - if len(parts) > 2: - t3 = t2 - delta = signs[2] * int(parts[2], 16) - if delta: - t3 = Timestamp((t2.raw + delta) * PRECISION) - elif not explicit: - t3 = t2 - return t1, t2, t3 - - -def normalize_timestamp(timestamp): - """ - Format a timestamp (string or numeric) into a standardized - xxxxxxxxxx.xxxxx (10.5) format. - - Note that timestamps using values greater than or equal to November 20th, - 2286 at 17:46 UTC will use 11 digits to represent the number of - seconds. - - :param timestamp: unix timestamp - :returns: normalized timestamp as a string - """ - return Timestamp(timestamp).normal - - -EPOCH = datetime.datetime(1970, 1, 1) - - -def last_modified_date_to_timestamp(last_modified_date_str): - """ - Convert a last modified date (like you'd get from a container listing, - e.g. 2014-02-28T23:22:36.698390) to a float. - """ - return Timestamp.from_isoformat(last_modified_date_str) - - -def normalize_delete_at_timestamp(timestamp, high_precision=False): - """ - Format a timestamp (string or numeric) into a standardized - xxxxxxxxxx (10) or xxxxxxxxxx.xxxxx (10.5) format. 
- - Note that timestamps less than 0000000000 are raised to - 0000000000 and values greater than November 20th, 2286 at - 17:46:39 UTC will be capped at that date and time, resulting in - no return value exceeding 9999999999.99999 (or 9999999999 if - using low-precision). - - This cap is because the expirer is already working through a - sorted list of strings that were all a length of 10. Adding - another digit would mess up the sort and cause the expirer to - break from processing early. By 2286, this problem will need to - be fixed, probably by creating an additional .expiring_objects - account to work from with 11 (or more) digit container names. - - :param timestamp: unix timestamp - :returns: normalized timestamp as a string - """ - fmt = '%016.5f' if high_precision else '%010d' - return fmt % min(max(0, float(timestamp)), 9999999999.99999) - - def mkdirs(path): """ Ensures the path is a directory or makes it if not. Errors if the path @@ -2073,6 +1415,11 @@ class SwiftLoggerAdapter(logging.LoggerAdapter): process() method to accomplish anything useful. """ + @property + def name(self): + # py3 does this for us already; add it for py2 + return self.logger.name + def get_metric_name(self, metric): # subclasses may override this method to annotate the metric name return metric @@ -2274,8 +1621,10 @@ class LogAdapter(logging.LoggerAdapter, object): emsg = '%s: %s' % (exc.__class__.__name__, exc.line) elif isinstance(exc, eventlet.Timeout): emsg = exc.__class__.__name__ - if hasattr(exc, 'seconds'): - emsg += ' (%ss)' % exc.seconds + detail = '%ss' % exc.seconds + if hasattr(exc, 'created_at'): + detail += ' after %0.2fs' % (time.time() - exc.created_at) + emsg += ' (%s)' % detail if isinstance(exc, swift.common.exceptions.MessageTimeout): if exc.msg: emsg += ' %s' % exc.msg @@ -3205,6 +2554,7 @@ def readconf(conf_path, section_name=None, log_name=None, defaults=None, # values like "1%" (which we want to support for # fallocate_reserve). 
c = ConfigParser(defaults, interpolation=NicerInterpolation()) + c.optionxform = str # Don't lower-case keys if hasattr(conf_path, 'readline'): if hasattr(conf_path, 'seek'): @@ -5107,87 +4457,6 @@ def parse_content_disposition(header): return header, attributes -class sockaddr_alg(ctypes.Structure): - _fields_ = [("salg_family", ctypes.c_ushort), - ("salg_type", ctypes.c_ubyte * 14), - ("salg_feat", ctypes.c_uint), - ("salg_mask", ctypes.c_uint), - ("salg_name", ctypes.c_ubyte * 64)] - - -_bound_md5_sockfd = None - - -def get_md5_socket(): - """ - Get an MD5 socket file descriptor. One can MD5 data with it by writing it - to the socket with os.write, then os.read the 16 bytes of the checksum out - later. - - NOTE: It is the caller's responsibility to ensure that os.close() is - called on the returned file descriptor. This is a bare file descriptor, - not a Python object. It doesn't close itself. - """ - - # Linux's AF_ALG sockets work like this: - # - # First, initialize a socket with socket() and bind(). This tells the - # socket what algorithm to use, as well as setting up any necessary bits - # like crypto keys. Of course, MD5 doesn't need any keys, so it's just the - # algorithm name. - # - # Second, to hash some data, get a second socket by calling accept() on - # the first socket. Write data to the socket, then when finished, read the - # checksum from the socket and close it. This lets you checksum multiple - # things without repeating all the setup code each time. - # - # Since we only need to bind() one socket, we do that here and save it for - # future re-use. That way, we only use one file descriptor to get an MD5 - # socket instead of two, and we also get to save some syscalls. 
- - global _bound_md5_sockfd - global _libc_socket - global _libc_bind - global _libc_accept - - if _libc_accept is None: - _libc_accept = load_libc_function('accept', fail_if_missing=True) - if _libc_socket is None: - _libc_socket = load_libc_function('socket', fail_if_missing=True) - if _libc_bind is None: - _libc_bind = load_libc_function('bind', fail_if_missing=True) - - # Do this at first call rather than at import time so that we don't use a - # file descriptor on systems that aren't using any MD5 sockets. - if _bound_md5_sockfd is None: - sockaddr_setup = sockaddr_alg( - AF_ALG, - (ord('h'), ord('a'), ord('s'), ord('h'), 0), - 0, 0, - (ord('m'), ord('d'), ord('5'), 0)) - hash_sockfd = _libc_socket(ctypes.c_int(AF_ALG), - ctypes.c_int(socket.SOCK_SEQPACKET), - ctypes.c_int(0)) - if hash_sockfd < 0: - raise IOError(ctypes.get_errno(), - "Failed to initialize MD5 socket") - - bind_result = _libc_bind(ctypes.c_int(hash_sockfd), - ctypes.pointer(sockaddr_setup), - ctypes.c_int(ctypes.sizeof(sockaddr_alg))) - if bind_result < 0: - os.close(hash_sockfd) - raise IOError(ctypes.get_errno(), "Failed to bind MD5 socket") - - _bound_md5_sockfd = hash_sockfd - - md5_sockfd = _libc_accept(ctypes.c_int(_bound_md5_sockfd), None, 0) - if md5_sockfd < 0: - raise IOError(ctypes.get_errno(), "Failed to accept MD5 socket") - - return md5_sockfd - - try: _test_md5 = hashlib.md5(usedforsecurity=False) # nosec @@ -5443,6 +4712,12 @@ class NamespaceBoundList(object): """ self.bounds = [] if bounds is None else bounds + def __eq__(self, other): + # test for equality of NamespaceBoundList objects only + if not isinstance(other, NamespaceBoundList): + return False + return self.bounds == other.bounds + @classmethod def parse(cls, namespaces): """ @@ -5498,7 +4773,12 @@ class NamespaceBoundList(object): def get_namespace(self, item): """ - Get a Namespace instance that contains ``item``. + Get a Namespace instance that contains ``item`` by bisecting on the + lower bounds directly. 
This function is used for performance sensitive + path, for example, '_get_update_shard' in proxy object controller. For + normal paths, convert NamespaceBoundList to a list of Namespaces, and + use `~swift.common.utils.find_namespace` or + `~swift.common.utils.filter_namespaces`. :param item: The item for a which a Namespace is to be found. :return: the Namespace that contains ``item``. @@ -5509,6 +4789,24 @@ class NamespaceBoundList(object): else self.bounds[pos + 1][0]) return Namespace(name, lower, upper) + def get_namespaces(self): + """ + Get the contained namespaces as a list of contiguous Namespaces ordered + by lower bound. + + :return: A list of Namespace objects which are ordered by + ``lower bound``. + """ + if not self.bounds: + return [] + namespaces = [] + num_ns = len(self.bounds) + for i in range(num_ns): + lower, name = self.bounds[i] + upper = ('' if i + 1 == num_ns else self.bounds[i + 1][0]) + namespaces.append(Namespace(name, lower, upper)) + return namespaces + class ShardName(object): """ @@ -5693,11 +4991,11 @@ class ShardRange(Namespace): '_deleted', '_state', '_count', '_bytes', '_tombstones', '_reported') - def __init__(self, name, timestamp, + def __init__(self, name, timestamp=0, lower=Namespace.MIN, upper=Namespace.MAX, object_count=0, bytes_used=0, meta_timestamp=None, deleted=False, state=None, state_timestamp=None, epoch=None, - reported=False, tombstones=-1): + reported=False, tombstones=-1, **kwargs): super(ShardRange, self).__init__(name=name, lower=lower, upper=upper) self.account = self.container = self._timestamp = \ self._meta_timestamp = self._state_timestamp = self._epoch = None @@ -5720,7 +5018,8 @@ class ShardRange(Namespace): def sort_key(cls, sr): # defines the sort order for shard ranges # note if this ever changes to *not* sort by upper first then it breaks - # a key assumption for bisect, which is used by utils.find_shard_range + # a key assumption for bisect, which is used by utils.find_namespace + # with shard 
ranges. return sr.upper, sr.state, sr.lower, sr.name def is_child_of(self, parent): @@ -6276,7 +5575,7 @@ class ShardRangeList(UserList): containing the filtered shard ranges. """ return ShardRangeList( - filter_shard_ranges(self, includes, marker, end_marker)) + filter_namespaces(self, includes, marker, end_marker)) def find_lower(self, condition): """ @@ -6297,44 +5596,45 @@ class ShardRangeList(UserList): return self.upper -def find_shard_range(item, ranges): +def find_namespace(item, namespaces): """ - Find a ShardRange in given list of ``shard_ranges`` whose namespace + Find a Namespace/ShardRange in given list of ``namespaces`` whose namespace contains ``item``. - :param item: The item for a which a ShardRange is to be found. - :param ranges: a sorted list of ShardRanges. - :return: the ShardRange whose namespace contains ``item``, or None if - no suitable range is found. + :param item: The item for a which a Namespace is to be found. + :param ranges: a sorted list of Namespaces. + :return: the Namespace/ShardRange whose namespace contains ``item``, or + None if no suitable Namespace is found. """ - index = bisect.bisect_left(ranges, item) - if index != len(ranges) and item in ranges[index]: - return ranges[index] + index = bisect.bisect_left(namespaces, item) + if index != len(namespaces) and item in namespaces[index]: + return namespaces[index] return None -def filter_shard_ranges(shard_ranges, includes, marker, end_marker): +def filter_namespaces(namespaces, includes, marker, end_marker): """ - Filter the given shard ranges to those whose namespace includes the - ``includes`` name or any part of the namespace between ``marker`` and + Filter the given Namespaces/ShardRanges to those whose namespace includes + the ``includes`` name or any part of the namespace between ``marker`` and ``end_marker``. If none of ``includes``, ``marker`` or ``end_marker`` are - specified then all shard ranges will be returned. + specified then all Namespaces will be returned. 
- :param shard_ranges: A list of :class:`~swift.common.utils.ShardRange`. - :param includes: a string; if not empty then only the shard range, if any, - whose namespace includes this string will be returned, and ``marker`` - and ``end_marker`` will be ignored. + :param namespaces: A list of :class:`~swift.common.utils.Namespace` or + :class:`~swift.common.utils.ShardRange`. + :param includes: a string; if not empty then only the Namespace, + if any, whose namespace includes this string will be returned, + ``marker`` and ``end_marker`` will be ignored. :param marker: if specified then only shard ranges whose upper bound is greater than this value will be returned. :param end_marker: if specified then only shard ranges whose lower bound is less than this value will be returned. - :return: A filtered list of :class:`~swift.common.utils.ShardRange`. + :return: A filtered list of :class:`~swift.common.utils.Namespace`. """ if includes: - shard_range = find_shard_range(includes, shard_ranges) - return [shard_range] if shard_range else [] + namespace = find_namespace(includes, namespaces) + return [namespace] if namespace else [] - def shard_range_filter(sr): + def namespace_filter(sr): end = start = True if end_marker: end = end_marker > sr.lower @@ -6343,79 +5643,13 @@ def filter_shard_ranges(shard_ranges, includes, marker, end_marker): return start and end if marker or end_marker: - return list(filter(shard_range_filter, shard_ranges)) + return list(filter(namespace_filter, namespaces)) if marker == Namespace.MAX or end_marker == Namespace.MIN: - # MIN and MAX are both Falsy so not handled by shard_range_filter + # MIN and MAX are both Falsy so not handled by namespace_filter return [] - return shard_ranges - - -def modify_priority(conf, logger): - """ - Modify priority by nice and ionice. 
- """ - - global _libc_setpriority - if _libc_setpriority is None: - _libc_setpriority = load_libc_function('setpriority', - errcheck=True) - - def _setpriority(nice_priority): - """ - setpriority for this pid - - :param nice_priority: valid values are -19 to 20 - """ - try: - _libc_setpriority(PRIO_PROCESS, os.getpid(), - int(nice_priority)) - except (ValueError, OSError): - print(_("WARNING: Unable to modify scheduling priority of process." - " Keeping unchanged! Check logs for more info. ")) - logger.exception('Unable to modify nice priority') - else: - logger.debug('set nice priority to %s' % nice_priority) - - nice_priority = conf.get('nice_priority') - if nice_priority is not None: - _setpriority(nice_priority) - - global _posix_syscall - if _posix_syscall is None: - _posix_syscall = load_libc_function('syscall', errcheck=True) - - def _ioprio_set(io_class, io_priority): - """ - ioprio_set for this process - - :param io_class: the I/O class component, can be - IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, - or IOPRIO_CLASS_IDLE - :param io_priority: priority value in the I/O class - """ - try: - io_class = IO_CLASS_ENUM[io_class] - io_priority = int(io_priority) - _posix_syscall(NR_ioprio_set(), - IOPRIO_WHO_PROCESS, - os.getpid(), - IOPRIO_PRIO_VALUE(io_class, io_priority)) - except (KeyError, ValueError, OSError): - print(_("WARNING: Unable to modify I/O scheduling class " - "and priority of process. Keeping unchanged! 
" - "Check logs for more info.")) - logger.exception("Unable to modify ionice priority") - else: - logger.debug('set ionice class %s priority %s', - io_class, io_priority) - - io_class = conf.get("ionice_class") - if io_class is None: - return - io_priority = conf.get("ionice_priority", 0) - _ioprio_set(io_class, io_priority) + return namespaces def o_tmpfile_in_path_supported(dirpath): @@ -6995,14 +6229,15 @@ class Watchdog(object): :param timeout: duration before the timeout expires :param exc: exception to throw when the timeout expire, must inherit - from eventlet.timeouts.Timeout + from eventlet.Timeout :param timeout_at: allow to force the expiration timestamp :return: id of the scheduled timeout, needed to cancel it """ + now = time.time() if not timeout_at: - timeout_at = time.time() + timeout + timeout_at = now + timeout gth = eventlet.greenthread.getcurrent() - timeout_definition = (timeout, timeout_at, gth, exc) + timeout_definition = (timeout, timeout_at, gth, exc, now) key = id(timeout_definition) self._timeouts[key] = timeout_definition @@ -7025,8 +6260,7 @@ class Watchdog(object): :param key: timeout id, as returned by start() """ try: - if key in self._timeouts: - del(self._timeouts[key]) + del(self._timeouts[key]) except KeyError: pass @@ -7046,15 +6280,14 @@ class Watchdog(object): self._next_expiration = None if self._evt.ready(): self._evt.reset() - for k, (timeout, timeout_at, gth, exc) in list(self._timeouts.items()): + for k, (timeout, timeout_at, gth, exc, + created_at) in list(self._timeouts.items()): if timeout_at <= now: - try: - if k in self._timeouts: - del(self._timeouts[k]) - except KeyError: - pass + self.stop(k) e = exc() + # set this after __init__ to keep it off the eventlet scheduler e.seconds = timeout + e.created_at = created_at eventlet.hubs.get_hub().schedule_call_global(0, gth.throw, e) else: if (self._next_expiration is None diff --git a/swift/common/utils/libc.py b/swift/common/utils/libc.py new file mode 100644 index 
000000000..df2179020 --- /dev/null +++ b/swift/common/utils/libc.py @@ -0,0 +1,487 @@ +# Copyright (c) 2010-2023 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functions Swift uses to interact with libc and other low-level APIs.""" + +import ctypes +import ctypes.util +import errno +import fcntl +import logging +import os +import platform +import socket + + +# These are lazily pulled from libc elsewhere +_sys_fallocate = None +_posix_fadvise = None +_libc_socket = None +_libc_bind = None +_libc_accept = None +# see man -s 2 setpriority +_libc_setpriority = None +# see man -s 2 syscall +_posix_syscall = None + +# If set to non-zero, fallocate routines will fail based on free space +# available being at or below this amount, in bytes. +FALLOCATE_RESERVE = 0 +# Indicates if FALLOCATE_RESERVE is the percentage of free space (True) or +# the number of bytes (False). 
FALLOCATE_IS_PERCENT = False

# Flag bits for fallocate(2), taken from /usr/include/linux/falloc.h
FALLOC_FL_KEEP_SIZE = 1
FALLOC_FL_PUNCH_HOLE = 2

# "which" selector for setpriority(2), taken from
# /usr/src/linux-headers-*/include/uapi/linux/resource.h
PRIO_PROCESS = 0


# Syscall numbers are listed in headers such as
# /usr/include/x86_64-linux-gnu/asm/unistd_64.h; the one wanted here is
# ioprio_set, see man -s 2 ioprio_set
def NR_ioprio_set():
    """
    Give __NR_ioprio_set value for your system.

    :returns: 251 on 64-bit x86_64, 30 on 64-bit aarch64
    :raises OSError: for any other machine/word-size combination
    """
    machine = os.uname()[4]
    word_size = platform.architecture()[0]
    # only 64-bit x86_64 and AArch64 are supported
    if word_size == '64bit':
        if machine == 'x86_64':
            return 251
        if machine == 'aarch64':
            return 30
    raise OSError("Swift doesn't support ionice priority for %s %s" %
                  (machine, word_size))


# The syscall number differs between architectures; to check the value
# on a given system, compile and run something like this:
"""
#include <stdio.h>
#include <sys/syscall.h>

int main(int argc, const char* argv[]) {
    printf("%d\n", __NR_ioprio_set);
    return 0;
}
"""

# this is the value for "which" that says our who value will be a pid
# pulled out of /usr/src/linux-headers-*/include/linux/ioprio.h
IOPRIO_WHO_PROCESS = 1


# Maps the configured I/O class name to its numeric value
IO_CLASS_ENUM = {
    'IOPRIO_CLASS_RT': 1,
    'IOPRIO_CLASS_BE': 2,
    'IOPRIO_CLASS_IDLE': 3,
}

# the IOPRIO_PRIO_VALUE "macro" is also pulled from
# /usr/src/linux-headers-*/include/linux/ioprio.h
IOPRIO_CLASS_SHIFT = 13


def IOPRIO_PRIO_VALUE(class_, data):
    """
    Combine an I/O class and a priority into a single integer.

    :param class_: numeric I/O class; shifted left by IOPRIO_CLASS_SHIFT
    :param data: priority value, OR-ed into the low bits
    :returns: the combined value
    """
    shifted_class = class_ << IOPRIO_CLASS_SHIFT
    return shifted_class | data


# These constants are Linux-specific, and Python doesn't seem to know
# about them. We ask anyway just in case that ever gets fixed.
#
# The values were copied from the Linux 3.x kernel headers.
AF_ALG = getattr(socket, 'AF_ALG', 38)
F_SETPIPE_SZ = getattr(fcntl, 'F_SETPIPE_SZ', 1031)


def noop_libc_function(*args):
    """Stand-in for a libc function that could not be found; returns 0."""
    return 0


def load_libc_function(func_name, log_error=True,
                       fail_if_missing=False, errcheck=False):
    """
    Attempt to find the function in libc, otherwise return a no-op func.

    :param func_name: name of the function to pull from libc.
    :param log_error: log an error when a function can't be found
    :param fail_if_missing: raise an exception when a function can't be found.
                            Default behavior is to return a no-op function.
    :param errcheck: boolean, if true install a wrapper on the function
                     to check for a return values of -1 and call
                     ctypes.get_errno and raise an OSError
    :returns: the libc function, or noop_libc_function if it is missing
    :raises AttributeError: if the function is missing and fail_if_missing
                            is true
    """
    try:
        # use_errno=True makes ctypes keep a private copy of errno that
        # ctypes.get_errno() reads after a call
        libc = ctypes.CDLL(ctypes.util.find_library('c'), use_errno=True)
        func = getattr(libc, func_name)
    except AttributeError:
        if fail_if_missing:
            raise
        if log_error:
            logging.warning("Unable to locate %s in libc.  Leaving as a "
                            "no-op.", func_name)
        return noop_libc_function
    if errcheck:
        def _errcheck(result, f, args):
            if result == -1:
                errcode = ctypes.get_errno()
                raise OSError(errcode, os.strerror(errcode))
            return result
        func.errcheck = _errcheck
    return func


class _LibcWrapper(object):
    """
    A callable object that forwards its calls to a C function from libc.

    These objects are lazy. libc will not be checked until someone tries to
    either call the function or check its availability.

    _LibcWrapper objects have an "available" property; if true, then libc
    has the function of that name. If false, then calls will fail with a
    NotImplementedError.
    """

    def __init__(self, func_name):
        self._func_name = func_name
        self._func_handle = None
        self._loaded = False

    def _ensure_loaded(self):
        """Look the function up in libc the first time it is needed."""
        if not self._loaded:
            func_name = self._func_name
            try:
                # Keep everything in this try-block in local variables so
                # that a typo in self.some_attribute_name doesn't raise a
                # spurious AttributeError.
                func_handle = load_libc_function(
                    func_name, fail_if_missing=True)
                self._func_handle = func_handle
            except AttributeError:
                # We pass fail_if_missing=True to load_libc_function and
                # then ignore the error. It's weird, but otherwise we have
                # to check if self._func_handle is noop_libc_function, and
                # that's even weirder.
                pass
            self._loaded = True

    @property
    def available(self):
        """True if libc provides the wrapped function."""
        self._ensure_loaded()
        return bool(self._func_handle)

    def __call__(self, *args):
        if self.available:
            return self._func_handle(*args)
        else:
            raise NotImplementedError(
                "No function %r found in libc" % self._func_name)


def config_fallocate_value(reserve_value):
    """
    Parse a fallocate reserve setting.

    :param reserve_value: the configured value; a trailing ``%`` marks it
                          as a percentage, otherwise it must be an integer
    :returns: a tuple of (reserve_value, is_percent); reserve_value is a
              float when is_percent is True and an int otherwise
    :raises ValueError: on invalid fallocate value
    """
    # Work on the string form so that a value that is already a number
    # fails (or succeeds) the same way its string spelling would, rather
    # than raising TypeError when sliced.
    text = str(reserve_value)
    try:
        if text[-1:] == '%':
            return float(text[:-1]), True
        return int(text), False
    except ValueError:
        raise ValueError('Error: %s is an invalid value for fallocate'
                         '_reserve.' % reserve_value)


_fallocate_enabled = True
_fallocate_warned_about_missing = False
_sys_fallocate = _LibcWrapper('fallocate')
_sys_posix_fallocate = _LibcWrapper('posix_fallocate')


def disable_fallocate():
    """Turn every later call to fallocate() into a no-op."""
    global _fallocate_enabled
    _fallocate_enabled = False


def fallocate(fd, size, offset=0):
    """
    Pre-allocate disk space for a file.

    This function can be disabled by calling disable_fallocate(). If no
    suitable C function is available in libc, this function is a no-op.

    :param fd: file descriptor
    :param size: size to allocate (in bytes); negative sizes are treated
                 as zero
    :param offset: offset (in bytes) at which the allocation starts
    :raises ValueError: if size or offset is out of range
    :raises OSError: with ENOSPC if the allocation would leave free space
                     at or below FALLOCATE_RESERVE, or with the error
                     reported by libc if the allocation fails
    """
    global _fallocate_warned_about_missing
    if not _fallocate_enabled:
        return

    if size < 0:
        size = 0  # Done historically; not really sure why
    if size >= (1 << 63):
        raise ValueError('size must be less than 2 ** 63')
    if offset < 0:
        raise ValueError('offset must be non-negative')
    if offset >= (1 << 63):
        raise ValueError('offset must be less than 2 ** 63')

    # Make sure there's some (configurable) amount of free space in
    # addition to the number of bytes we're allocating.
    if FALLOCATE_RESERVE:
        st = os.fstatvfs(fd)
        free = st.f_frsize * st.f_bavail - size
        if FALLOCATE_IS_PERCENT:
            free = (float(free) / float(st.f_frsize * st.f_blocks)) * 100
        if float(free) <= float(FALLOCATE_RESERVE):
            raise OSError(
                errno.ENOSPC,
                'FALLOCATE_RESERVE fail %g <= %g' %
                (free, FALLOCATE_RESERVE))

    if _sys_fallocate.available:
        # Parameters are (fd, mode, offset, length).
        #
        # mode=FALLOC_FL_KEEP_SIZE pre-allocates invisibly (without
        # affecting the reported file size).
        ret = _sys_fallocate(
            fd, FALLOC_FL_KEEP_SIZE, ctypes.c_uint64(offset),
            ctypes.c_uint64(size))
        err = ctypes.get_errno()
    elif _sys_posix_fallocate.available:
        # Parameters are (fd, offset, length).
        ret = _sys_posix_fallocate(fd, ctypes.c_uint64(offset),
                                   ctypes.c_uint64(size))
        # Unlike fallocate(), posix_fallocate() returns the error number
        # directly and does not set errno, so the return value is the
        # error code.
        err = ret
    else:
        # No suitable fallocate-like function is in our libc. Warn about it,
        # but just once per process, and then do nothing.
        if not _fallocate_warned_about_missing:
            logging.warning("Unable to locate fallocate, posix_fallocate in "
                            "libc.  Leaving as a no-op.")
            _fallocate_warned_about_missing = True
        return

    # "Not supported here" style errors are deliberately ignored
    if ret and err not in (0, errno.ENOSYS, errno.EOPNOTSUPP,
                           errno.EINVAL):
        raise OSError(err, 'Unable to fallocate(%s)' % size)


def punch_hole(fd, offset, length):
    """
    De-allocate disk space in the middle of a file.

    :param fd: file descriptor
    :param offset: index of first byte to de-allocate
    :param length: number of bytes to de-allocate
    :raises ValueError: if offset or length is out of range
    :raises OSError: if libc has no fallocate (ENOTSUP) or the call fails
    """
    if offset < 0:
        raise ValueError('offset must be non-negative')
    if offset >= (1 << 63):
        raise ValueError('offset must be less than 2 ** 63')
    if length <= 0:
        raise ValueError('length must be positive')
    if length >= (1 << 63):
        raise ValueError('length must be less than 2 ** 63')

    if _sys_fallocate.available:
        # Parameters are (fd, mode, offset, length).
        ret = _sys_fallocate(
            fd,
            FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
            ctypes.c_uint64(offset),
            ctypes.c_uint64(length))
        err = ctypes.get_errno()
        if ret and err:
            mode_str = "FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE"
            raise OSError(err, "Unable to fallocate(%d, %s, %d, %d)" % (
                fd, mode_str, offset, length))
    else:
        raise OSError(errno.ENOTSUP,
                      'No suitable C function found for hole punching')


def drop_buffer_cache(fd, offset, length):
    """
    Drop 'buffer' cache for the given range of the given file.

    A non-zero result from libc is logged as a warning, not raised.

    :param fd: file descriptor
    :param offset: start offset
    :param length: length
    """
    global _posix_fadvise
    if _posix_fadvise is None:
        _posix_fadvise = load_libc_function('posix_fadvise64')
    # 4 means "POSIX_FADV_DONTNEED"
    ret = _posix_fadvise(fd, ctypes.c_uint64(offset),
                         ctypes.c_uint64(length), 4)
    if ret != 0:
        logging.warning("posix_fadvise64(%(fd)s, %(offset)s, %(length)s, 4) "
                        "-> %(ret)s", {'fd': fd, 'offset': offset,
                                       'length': length, 'ret': ret})


class sockaddr_alg(ctypes.Structure):
    # Address structure passed to bind() for an AF_ALG socket
    _fields_ = [("salg_family", ctypes.c_ushort),
                ("salg_type", ctypes.c_ubyte * 14),
                ("salg_feat", ctypes.c_uint),
                ("salg_mask", ctypes.c_uint),
                ("salg_name", ctypes.c_ubyte * 64)]


_bound_md5_sockfd = None


def get_md5_socket():
    """
    Get an MD5 socket file descriptor. One can MD5 data with it by writing it
    to the socket with os.write, then os.read the 16 bytes of the checksum out
    later.

    NOTE: It is the caller's responsibility to ensure that os.close() is
    called on the returned file descriptor. This is a bare file descriptor,
    not a Python object. It doesn't close itself.

    :returns: a file descriptor (int)
    :raises IOError: if the socket cannot be created, bound or accepted
    """

    # Linux's AF_ALG sockets work like this:
    #
    # First, initialize a socket with socket() and bind(). This tells the
    # socket what algorithm to use, as well as setting up any necessary bits
    # like crypto keys. Of course, MD5 doesn't need any keys, so it's just the
    # algorithm name.
    #
    # Second, to hash some data, get a second socket by calling accept() on
    # the first socket. Write data to the socket, then when finished, read the
    # checksum from the socket and close it. This lets you checksum multiple
    # things without repeating all the setup code each time.
    #
    # Since we only need to bind() one socket, we do that here and save it for
    # future re-use. That way, we only use one file descriptor to get an MD5
    # socket instead of two, and we also get to save some syscalls.

    global _bound_md5_sockfd
    global _libc_socket
    global _libc_bind
    global _libc_accept

    if _libc_accept is None:
        _libc_accept = load_libc_function('accept', fail_if_missing=True)
    if _libc_socket is None:
        _libc_socket = load_libc_function('socket', fail_if_missing=True)
    if _libc_bind is None:
        _libc_bind = load_libc_function('bind', fail_if_missing=True)

    # Do this at first call rather than at import time so that we don't use a
    # file descriptor on systems that aren't using any MD5 sockets.
    if _bound_md5_sockfd is None:
        sockaddr_setup = sockaddr_alg(
            AF_ALG,
            (ord('h'), ord('a'), ord('s'), ord('h'), 0),
            0, 0,
            (ord('m'), ord('d'), ord('5'), 0))
        hash_sockfd = _libc_socket(ctypes.c_int(AF_ALG),
                                   ctypes.c_int(socket.SOCK_SEQPACKET),
                                   ctypes.c_int(0))
        if hash_sockfd < 0:
            raise IOError(ctypes.get_errno(),
                          "Failed to initialize MD5 socket")

        bind_result = _libc_bind(ctypes.c_int(hash_sockfd),
                                 ctypes.pointer(sockaddr_setup),
                                 ctypes.c_int(ctypes.sizeof(sockaddr_alg)))
        if bind_result < 0:
            # don't leak the half-initialized socket
            os.close(hash_sockfd)
            raise IOError(ctypes.get_errno(), "Failed to bind MD5 socket")

        _bound_md5_sockfd = hash_sockfd

    md5_sockfd = _libc_accept(ctypes.c_int(_bound_md5_sockfd), None, 0)
    if md5_sockfd < 0:
        raise IOError(ctypes.get_errno(), "Failed to accept MD5 socket")

    return md5_sockfd


def modify_priority(conf, logger):
    """
    Modify priority by nice and ionice.

    Failures are reported (printed and logged) but never raised.

    :param conf: a dict-like object; the keys read are ``nice_priority``,
                 ``ionice_class`` and ``ionice_priority``
    :param logger: logger used to report the outcome
    """

    global _libc_setpriority
    if _libc_setpriority is None:
        _libc_setpriority = load_libc_function('setpriority',
                                               errcheck=True)

    def _setpriority(nice_priority):
        """
        setpriority for this pid

        :param nice_priority: the nice value; see man -s 2 setpriority
                              (on Linux the range is -20 to 19)
        """
        try:
            _libc_setpriority(PRIO_PROCESS, os.getpid(),
                              int(nice_priority))
        except (ValueError, OSError):
            print("WARNING: Unable to modify scheduling priority of process."
                  " Keeping unchanged! Check logs for more info. ")
            logger.exception('Unable to modify nice priority')
        else:
            logger.debug('set nice priority to %s' % nice_priority)

    nice_priority = conf.get('nice_priority')
    if nice_priority is not None:
        _setpriority(nice_priority)

    global _posix_syscall
    if _posix_syscall is None:
        _posix_syscall = load_libc_function('syscall', errcheck=True)

    def _ioprio_set(io_class, io_priority):
        """
        ioprio_set for this process

        :param io_class: the I/O class component, can be
                         IOPRIO_CLASS_RT, IOPRIO_CLASS_BE,
                         or IOPRIO_CLASS_IDLE
        :param io_priority: priority value in the I/O class
        """
        try:
            io_class = IO_CLASS_ENUM[io_class]
            io_priority = int(io_priority)
            _posix_syscall(NR_ioprio_set(),
                           IOPRIO_WHO_PROCESS,
                           os.getpid(),
                           IOPRIO_PRIO_VALUE(io_class, io_priority))
        except (KeyError, ValueError, OSError):
            print("WARNING: Unable to modify I/O scheduling class "
                  "and priority of process. Keeping unchanged! "
                  "Check logs for more info.")
            logger.exception("Unable to modify ionice priority")
        else:
            logger.debug('set ionice class %s priority %s',
                         io_class, io_priority)

    io_class = conf.get("ionice_class")
    if io_class is None:
        return
    io_priority = conf.get("ionice_priority", 0)
    _ioprio_set(io_class, io_priority)


# diff --git a/swift/common/utils/timestamp.py b/swift/common/utils/timestamp.py
# new file mode 100644
# index 000000000..be83fe512
# --- /dev/null
# +++ b/swift/common/utils/timestamp.py
# @@ -0,0 +1,399 @@
# Copyright (c) 2010-2023 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Timestamp-related functions for use with Swift."""

import datetime
import functools
import math
import time

import six


NORMAL_FORMAT = "%016.05f"
INTERNAL_FORMAT = NORMAL_FORMAT + '_%016x'
SHORT_FORMAT = NORMAL_FORMAT + '_%x'
MAX_OFFSET = (16 ** 16) - 1
PRECISION = 1e-5
# Setting this to True will cause the internal format to always display
# extended digits - even when the value is equivalent to the normalized form.
# This isn't ideal during an upgrade when some servers might not understand
# the new time format - but flipping it to True works great for testing.
FORCE_INTERNAL = False  # or True


@functools.total_ordering
class Timestamp(object):
    """
    Internal Representation of Swift Time.

    The normalized form of the X-Timestamp header looks like a float
    with a fixed width to ensure stable string sorting - normalized
    timestamps look like "1402464677.04188"

    To support overwrites of existing data without modifying the original
    timestamp but still maintain consistency a second internal offset vector
    is appended to the normalized timestamp form which compares and sorts
    greater than the fixed width float format but less than a newer timestamp.
    The internalized format of timestamps looks like
    "1402464677.04188_0000000000000000" - the portion after the underscore is
    the offset and is a formatted hexadecimal integer.

    The internalized form is not exposed to clients in responses from
    Swift.  Normal client operations will not create a timestamp with an
    offset.

    The Timestamp class in common.utils supports internalized and
    normalized formatting of timestamps and also comparison of timestamp
    values.  When the offset value of a Timestamp is 0 - it's considered
    insignificant and need not be represented in the string format; to
    support backwards compatibility during a Swift upgrade the
    internalized and normalized form of a Timestamp with an
    insignificant offset are identical.  When a timestamp includes an
    offset it will always be represented in the internalized form, but
    is still excluded from the normalized form.  Timestamps with an
    equivalent timestamp portion (the float part) will compare and order
    by their offset.  Timestamps with a greater timestamp portion will
    always compare and order greater than a Timestamp with a lesser
    timestamp regardless of its offset.  String comparison and ordering
    is guaranteed for the internalized string format, and is backwards
    compatible for normalized timestamps which do not include an offset.
    """

    def __init__(self, timestamp, offset=0, delta=0, check_bounds=True):
        """
        Create a new Timestamp.

        :param timestamp: time in seconds since the Epoch, may be any of:

            * a float or integer
            * normalized/internalized string
            * another instance of this class (offset is preserved)

        :param offset: the second internal offset vector, an int
        :param delta: deca-microsecond difference from the base timestamp
                      param, an int
        :param check_bounds: if true, reject timestamps that are negative
                             or not less than 10000000000
        :raises ValueError: if the timestamp, offset or delta is invalid
        """
        if isinstance(timestamp, bytes):
            timestamp = timestamp.decode('ascii')
        if isinstance(timestamp, six.string_types):
            # "<float>[_<hex offset>]"
            base, base_offset = timestamp.partition('_')[::2]
            self.timestamp = float(base)
            if '_' in base_offset:
                raise ValueError('invalid literal for int() with base 16: '
                                 '%r' % base_offset)
            if base_offset:
                self.offset = int(base_offset, 16)
            else:
                self.offset = 0
        else:
            self.timestamp = float(timestamp)
            self.offset = getattr(timestamp, 'offset', 0)
        # increment offset
        if offset >= 0:
            self.offset += offset
        else:
            raise ValueError('offset must be non-negative')
        if self.offset > MAX_OFFSET:
            raise ValueError('offset must be smaller than %d' % MAX_OFFSET)
        self.raw = int(round(self.timestamp / PRECISION))
        # add delta
        if delta:
            adjusted = self.raw + delta
            if adjusted <= 0:
                # report the bound in terms of the original raw value
                raise ValueError(
                    'delta must be greater than %d' % (-1 * self.raw))
            self.raw = adjusted
            self.timestamp = float(self.raw * PRECISION)
        if check_bounds:
            if self.timestamp < 0:
                raise ValueError('timestamp cannot be negative')
            if self.timestamp >= 10000000000:
                raise ValueError('timestamp too large')

    @classmethod
    def now(cls, offset=0, delta=0):
        """Return an instance for the current time."""
        return cls(time.time(), offset=offset, delta=delta)

    def __repr__(self):
        return INTERNAL_FORMAT % (self.timestamp, self.offset)

    def __str__(self):
        # force callers to pick .normal, .internal or .short explicitly
        raise TypeError('You must specify which string format is required')

    def __float__(self):
        return self.timestamp

    def __int__(self):
        return int(self.timestamp)

    def __nonzero__(self):
        return bool(self.timestamp or self.offset)

    def __bool__(self):
        return self.__nonzero__()

    @property
    def normal(self):
        """The timestamp formatted with NORMAL_FORMAT (no offset)."""
        return NORMAL_FORMAT % self.timestamp

    @property
    def internal(self):
        """
        The timestamp formatted with INTERNAL_FORMAT if it has an offset
        (or FORCE_INTERNAL is set), otherwise the same as ``normal``.
        """
        if self.offset or FORCE_INTERNAL:
            return INTERNAL_FORMAT % (self.timestamp, self.offset)
        else:
            return self.normal

    @property
    def short(self):
        """
        Like ``internal`` but formatted with SHORT_FORMAT, i.e. the offset
        is not zero-padded.
        """
        if self.offset or FORCE_INTERNAL:
            return SHORT_FORMAT % (self.timestamp, self.offset)
        else:
            return self.normal

    @property
    def isoformat(self):
        """
        Get an isoformat string representation of the 'normal' part of the
        Timestamp with microsecond precision and no trailing timezone, for
        example::

            1970-01-01T00:00:00.000000

        :return: an isoformat string
        """
        t = float(self.normal)
        if six.PY3:
            # On Python 3, round manually using ROUND_HALF_EVEN rounding
            # method, to use the same rounding method than Python 2. Python 3
            # used a different rounding method, but Python 3.4.4 and 3.5.1 use
            # again ROUND_HALF_EVEN as Python 2.
            # See https://bugs.python.org/issue23517
            frac, t = math.modf(t)
            us = round(frac * 1e6)
            if us >= 1000000:
                t += 1
                us -= 1000000
            elif us < 0:
                t -= 1
                us += 1000000
            dt = datetime.datetime.utcfromtimestamp(t)
            dt = dt.replace(microsecond=us)
        else:
            dt = datetime.datetime.utcfromtimestamp(t)

        isoformat = dt.isoformat()
        # python isoformat() doesn't include msecs when zero
        if len(isoformat) < len("1970-01-01T00:00:00.000000"):
            isoformat += ".000000"
        return isoformat

    @classmethod
    def from_isoformat(cls, date_string):
        """
        Parse an isoformat string representation of time to a Timestamp object.

        :param date_string: a string formatted as per an Timestamp.isoformat
            property.
        :return: an instance of this class.
        """
        start = datetime.datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S.%f")
        delta = start - EPOCH
        # This calculation is based on Python 2.7's Modules/datetimemodule.c,
        # function delta_to_microseconds(), but written in Python.
        return cls(delta.total_seconds())

    def ceil(self):
        """
        Return the 'normal' part of the timestamp rounded up to the nearest
        integer number of seconds.

        This value should be used whenever the second-precision Last-Modified
        time of a resource is required.

        :return: a float value with second precision.
        """
        return math.ceil(float(self))

    def __eq__(self, other):
        if other is None:
            return False
        if not isinstance(other, Timestamp):
            try:
                other = Timestamp(other, check_bounds=False)
            except (TypeError, ValueError):
                # not something that can be read as a timestamp, so it
                # cannot be equal to one
                return False
        return self.internal == other.internal

    def __ne__(self, other):
        return not (self == other)

    def __lt__(self, other):
        if other is None:
            return False
        if not isinstance(other, Timestamp):
            other = Timestamp(other, check_bounds=False)
        # out-of-bounds values don't have a comparable fixed-width string
        # form, so order them explicitly
        if other.timestamp < 0:
            return False
        if other.timestamp >= 10000000000:
            return True
        return self.internal < other.internal

    def __hash__(self):
        return hash(self.internal)

    def __invert__(self):
        if self.offset:
            raise ValueError('Cannot invert timestamps with offsets')
        return Timestamp((999999999999999 - self.raw) * PRECISION)


def encode_timestamps(t1, t2=None, t3=None, explicit=False):
    """
    Encode up to three timestamps into a string. Unlike a Timestamp object, the
    encoded string does NOT use fixed width fields and consequently no
    relative chronology of the timestamps can be inferred from lexicographic
    sorting of encoded timestamp strings.

    The format of the encoded string is:
        <t1>[<+/-><t2 - t1>[<+/-><t3 - t2>]]

    i.e. if t1 = t2 = t3 then just the string representation of t1 is returned,
    otherwise the time offsets for t2 and t3 are appended. If explicit is True
    then the offsets for t2 and t3 are always appended even if zero.

    Note: any offset value in t1 will be preserved, but offsets on t2 and t3
    are not preserved. In the anticipated use cases for this method (and the
    inverse decode_timestamps method) the timestamps passed as t2 and t3 are
    not expected to have offsets as they will be timestamps associated with a
    POST request. In the case where the encoding is used in a container objects
    table row, t1 could be the PUT or DELETE time but t2 and t3 represent the
    content type and metadata times (if different from the data file) i.e.
    correspond to POST timestamps. In the case where the encoded form is used
    in a .meta file name, t1 and t2 both correspond to POST timestamps.

    :param t1: a Timestamp
    :param t2: a Timestamp, or None
    :param t3: a Timestamp, or None; only used if t2 is given
    :param explicit: if true, always append the deltas for t2 and t3
    :returns: the encoded string
    """
    form = '{0}'
    values = [t1.short]
    if t2 is not None:
        t2_t1_delta = t2.raw - t1.raw
        explicit = explicit or (t2_t1_delta != 0)
        values.append(t2_t1_delta)
        if t3 is not None:
            t3_t2_delta = t3.raw - t2.raw
            explicit = explicit or (t3_t2_delta != 0)
            values.append(t3_t2_delta)
        if explicit:
            # deltas are written as signed hexadecimal
            form += '{1:+x}'
            if t3 is not None:
                form += '{2:+x}'
    return form.format(*values)


def decode_timestamps(encoded, explicit=False):
    """
    Parses a string of the form generated by encode_timestamps and returns
    a tuple of the three component timestamps. If explicit is False, component
    timestamps that are not explicitly encoded will be assumed to have zero
    delta from the previous component and therefore take the value of the
    previous component. If explicit is True, component timestamps that are
    not explicitly encoded will be returned with value None.

    :param encoded: a string as returned by encode_timestamps; any other
                    type is passed to Timestamp() and returned three times
    :param explicit: see above
    :returns: a tuple of (t1, t2, t3)
    """
    # TODO: some tests, e.g. in test_replicator, put float timestamps values
    # into container db's, hence this defensive check, but in real world
    # this may never happen.
    if not isinstance(encoded, six.string_types):
        ts = Timestamp(encoded)
        return ts, ts, ts

    parts = []
    signs = []
    pos_parts = encoded.split('+')
    for part in pos_parts:
        # parse time components and their signs
        # e.g. x-y+z --> parts = [x, y, z] and signs = [+1, -1, +1]
        neg_parts = part.split('-')
        parts = parts + neg_parts
        signs = signs + [1] + [-1] * (len(neg_parts) - 1)
    t1 = Timestamp(parts[0])
    t2 = t3 = None
    if len(parts) > 1:
        t2 = t1
        delta = signs[1] * int(parts[1], 16)
        # if delta = 0 we want t2 = t3 = t1 in order to
        # preserve any offset in t1 - only construct a distinct
        # timestamp if there is a non-zero delta.
        if delta:
            t2 = Timestamp((t1.raw + delta) * PRECISION)
    elif not explicit:
        t2 = t1
    if len(parts) > 2:
        t3 = t2
        delta = signs[2] * int(parts[2], 16)
        if delta:
            t3 = Timestamp((t2.raw + delta) * PRECISION)
    elif not explicit:
        t3 = t2
    return t1, t2, t3


def normalize_timestamp(timestamp):
    """
    Format a timestamp (string or numeric) into a standardized
    xxxxxxxxxx.xxxxx (10.5) format.

    Note that timestamps using values greater than or equal to November 20th,
    2286 at 17:46 UTC will use 11 digits to represent the number of
    seconds.

    :param timestamp: unix timestamp
    :returns: normalized timestamp as a string
    """
    return Timestamp(timestamp).normal


EPOCH = datetime.datetime(1970, 1, 1)


def last_modified_date_to_timestamp(last_modified_date_str):
    """
    Convert a last modified date (like you'd get from a container listing,
    e.g. 2014-02-28T23:22:36.698390) to a Timestamp.

    :param last_modified_date_str: a string in the format produced by
                                   Timestamp.isoformat
    :returns: a Timestamp
    """
    return Timestamp.from_isoformat(last_modified_date_str)


def normalize_delete_at_timestamp(timestamp, high_precision=False):
    """
    Format a timestamp (string or numeric) into a standardized
    xxxxxxxxxx (10) or xxxxxxxxxx.xxxxx (10.5) format.

    Note that timestamps less than 0000000000 are raised to
    0000000000 and values greater than November 20th, 2286 at
    17:46:39 UTC will be capped at that date and time, resulting in
    no return value exceeding 9999999999.99999 (or 9999999999 if
    using low-precision).

    This cap is because the expirer is already working through a
    sorted list of strings that were all a length of 10. Adding
    another digit would mess up the sort and cause the expirer to
    break from processing early. By 2286, this problem will need to
    be fixed, probably by creating an additional .expiring_objects
    account to work from with 11 (or more) digit container names.

    :param timestamp: unix timestamp
    :param high_precision: if true, use the 10.5 format; otherwise the
                           10 digit format
    :returns: normalized timestamp as a string
    """
    fmt = '%016.5f' if high_precision else '%010d'
    return fmt % min(max(0, float(timestamp)), 9999999999.99999)