From d019ed198d90c5dee2b5dd5f39ffa922263f50e4 Mon Sep 17 00:00:00 2001 From: "martin f. krafft" Date: Thu, 26 Jan 2023 18:46:06 +0100 Subject: Keep @modifiers when parsing locales (#947) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Locale modifiers ("@variants") are described in the GNU gettext documentation like this: > The ‘@variant’ can denote any kind of characteristics that is not > already implied by the language ll and the country CC. […] It can also > denote a dialect of the language, … Whereas Babel previously discarded these, this patch stores the modifier information in the `Locale` objects, handling string representation accordingly. Resolves: #946 Signed-off-by: martin f. krafft Co-authored-by: Aarni Koskela --- babel/core.py | 115 ++++++++++++++++++++++++++++++++++++++--------------- tests/test_core.py | 6 ++- 2 files changed, 88 insertions(+), 33 deletions(-) diff --git a/babel/core.py b/babel/core.py index 6b0c45d..bdd176a 100644 --- a/babel/core.py +++ b/babel/core.py @@ -168,6 +168,7 @@ class Locale: territory: str | None = None, script: str | None = None, variant: str | None = None, + modifier: str | None = None, ) -> None: """Initialize the locale object from the given identifier components. 
@@ -181,6 +182,7 @@ class Locale: :param territory: the territory (country or region) code :param script: the script code :param variant: the variant code + :param modifier: a modifier (following the '@' symbol, sometimes called '@variant') :raise `UnknownLocaleError`: if no locale data is available for the requested locale """ @@ -192,10 +194,13 @@ class Locale: self.script = script #: the variant code self.variant = variant + #: the modifier + self.modifier = modifier self.__data = None identifier = str(self) - if not localedata.exists(identifier): + identifier_without_modifier = identifier.partition('@')[0] + if not localedata.exists(identifier_without_modifier): raise UnknownLocaleError(identifier) @classmethod @@ -290,6 +295,11 @@ class Locale: >>> Locale.parse('und_AT') Locale('de', territory='AT') + Modifiers are optional, and always at the end, separated by "@": + + >>> Locale.parse('de_AT@euro') + Locale('de', territory='AT', modifier='euro') + :param identifier: the locale identifier string :param sep: optional component separator :param resolve_likely_subtags: if this is specified then a locale will @@ -348,7 +358,11 @@ class Locale: # implement ICU like fuzzy locale objects and provide a way to # maximize and minimize locale tags. 
- language, territory, script, variant = parts + if len(parts) == 5: + language, territory, script, variant, modifier = parts + else: + language, territory, script, variant = parts + modifier = None language = get_global('language_aliases').get(language, language) territory = get_global('territory_aliases').get(territory, (territory,))[0] script = get_global('script_aliases').get(script, script) @@ -359,7 +373,7 @@ class Locale: if script == 'Zzzz': script = None - parts = language, territory, script, variant + parts = language, territory, script, variant, modifier # First match: try the whole identifier new_id = get_locale_identifier(parts) @@ -373,33 +387,40 @@ class Locale: # simplified identifier that is just the language likely_subtag = get_global('likely_subtags').get(language) if likely_subtag is not None: - language2, _, script2, variant2 = parse_locale(likely_subtag) - locale = _try_load_reducing((language2, territory, script2, variant2)) + parts2 = parse_locale(likely_subtag) + if len(parts2) == 5: + language2, _, script2, variant2, modifier2 = parse_locale(likely_subtag) + else: + language2, _, script2, variant2 = parse_locale(likely_subtag) + modifier2 = None + locale = _try_load_reducing((language2, territory, script2, variant2, modifier2)) if locale is not None: return locale raise UnknownLocaleError(input_id) def __eq__(self, other: object) -> bool: - for key in ('language', 'territory', 'script', 'variant'): + for key in ('language', 'territory', 'script', 'variant', 'modifier'): if not hasattr(other, key): return False return ( self.language == getattr(other, 'language') and # noqa: B009 self.territory == getattr(other, 'territory') and # noqa: B009 self.script == getattr(other, 'script') and # noqa: B009 - self.variant == getattr(other, 'variant') # noqa: B009 + self.variant == getattr(other, 'variant') and # noqa: B009 + self.modifier == getattr(other, 'modifier') # noqa: B009 ) def __ne__(self, other: object) -> bool: return not 
self.__eq__(other) def __hash__(self) -> int: - return hash((self.language, self.territory, self.script, self.variant)) + return hash((self.language, self.territory, self.script, + self.variant, self.modifier)) def __repr__(self) -> str: parameters = [''] - for key in ('territory', 'script', 'variant'): + for key in ('territory', 'script', 'variant', 'modifier'): value = getattr(self, key) if value is not None: parameters.append(f"{key}={value!r}") @@ -407,7 +428,8 @@ class Locale: def __str__(self) -> str: return get_locale_identifier((self.language, self.territory, - self.script, self.variant)) + self.script, self.variant, + self.modifier)) @property def _data(self) -> localedata.LocaleDataDict: @@ -424,6 +446,11 @@ class Locale: >>> Locale('zh', 'CN', script='Hans').get_display_name('en') u'Chinese (Simplified, China)' + Modifiers are currently passed through verbatim: + + >>> Locale('it', 'IT', modifier='euro').get_display_name('en') + u'Italian (Italy, euro)' + :param locale: the locale to use """ if locale is None: @@ -438,6 +465,8 @@ class Locale: details.append(locale.territories.get(self.territory)) if self.variant: details.append(locale.variants.get(self.variant)) + if self.modifier: + details.append(self.modifier) details = filter(None, details) if details: retval += f" ({', '.join(details)})" @@ -1115,9 +1144,12 @@ def negotiate_locale(preferred: Iterable[str], available: Iterable[str], sep: st return None -def parse_locale(identifier: str, sep: str = '_') -> tuple[str, str | None, str | None, str | None]: +def parse_locale( + identifier: str, + sep: str = '_' +) -> tuple[str, str | None, str | None, str | None, str | None]: """Parse a locale identifier into a tuple of the form ``(language, - territory, script, variant)``. + territory, script, variant, modifier)``. 
>>> parse_locale('zh_CN') ('zh', 'CN', None, None) @@ -1129,12 +1161,22 @@ def parse_locale(identifier: str, sep: str = '_') -> tuple[str, str | None, str ('en', '150', None, None) >>> parse_locale('en_us_posix') ('en', 'US', None, 'POSIX') + >>> parse_locale('it_IT@euro') + ('it', 'IT', None, None, 'euro') + >>> parse_locale('it_IT@custom') + ('it', 'IT', None, None, 'custom') + >>> parse_locale('it_IT@') + ('it', 'IT', None, None) The default component separator is "_", but a different separator can be - specified using the `sep` parameter: + specified using the `sep` parameter. + + The optional modifier is always separated with "@" and at the end: >>> parse_locale('zh-CN', sep='-') ('zh', 'CN', None, None) + >>> parse_locale('zh-CN@custom', sep='-') + ('zh', 'CN', None, None, 'custom') If the identifier cannot be parsed into a locale, a `ValueError` exception is raised: @@ -1144,14 +1186,13 @@ def parse_locale(identifier: str, sep: str = '_') -> tuple[str, str | None, str ... ValueError: 'not_a_LOCALE_String' is not a valid locale identifier - Encoding information and locale modifiers are removed from the identifier: + Encoding information is removed from the identifier, while modifiers are + kept: - >>> parse_locale('it_IT@euro') - ('it', 'IT', None, None) >>> parse_locale('en_US.UTF-8') ('en', 'US', None, None) >>> parse_locale('de_DE.iso885915@euro') - ('de', 'DE', None, None) + ('de', 'DE', None, None, 'euro') See :rfc:`4646` for more information. @@ -1161,13 +1202,10 @@ def parse_locale(identifier: str, sep: str = '_') -> tuple[str, str | None, str :raise `ValueError`: if the string does not appear to be a valid locale identifier """ + identifier, _, modifier = identifier.partition('@') if '.' 
in identifier: # this is probably the charset/encoding, which we don't care about identifier = identifier.split('.', 1)[0] - if '@' in identifier: - # this is a locale modifier such as @euro, which we don't care about - # either - identifier = identifier.split('@', 1)[0] parts = identifier.split(sep) lang = parts.pop(0).lower() @@ -1193,22 +1231,37 @@ def parse_locale(identifier: str, sep: str = '_') -> tuple[str, str | None, str if parts: raise ValueError(f"{identifier!r} is not a valid locale identifier") - return lang, territory, script, variant - - -def get_locale_identifier(tup: tuple[str, str | None, str | None, str | None], sep: str = '_') -> str: + # TODO(3.0): always return a 5-tuple + if modifier: + return lang, territory, script, variant, modifier + else: + return lang, territory, script, variant + + +def get_locale_identifier( + tup: tuple[str] + | tuple[str, str | None] + | tuple[str, str | None, str | None] + | tuple[str, str | None, str | None, str | None] + | tuple[str, str | None, str | None, str | None, str | None], + sep: str = "_", +) -> str: """The reverse of :func:`parse_locale`. It creates a locale identifier out - of a ``(language, territory, script, variant)`` tuple. Items can be set to + of a ``(language, territory, script, variant, modifier)`` tuple. Items can be set to ``None`` and trailing ``None``\\s can also be left out of the tuple. - >>> get_locale_identifier(('de', 'DE', None, '1999')) - 'de_DE_1999' + >>> get_locale_identifier(('de', 'DE', None, '1999', 'custom')) + 'de_DE_1999@custom' + >>> get_locale_identifier(('fi', None, None, None, 'custom')) + 'fi@custom' + .. versionadded:: 1.0 :param tup: the tuple as returned by :func:`parse_locale`. :param sep: the separator for the identifier. 
""" - tup = tuple(tup[:4]) - lang, territory, script, variant = tup + (None,) * (4 - len(tup)) - return sep.join(filter(None, (lang, script, territory, variant))) + tup = tuple(tup[:5]) + lang, territory, script, variant, modifier = tup + (None,) * (5 - len(tup)) + ret = sep.join(filter(None, (lang, script, territory, variant))) + return f'{ret}@{modifier}' if modifier else ret diff --git a/tests/test_core.py b/tests/test_core.py index 2a7e605..0023824 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -283,10 +283,12 @@ def test_parse_locale(): assert (excinfo.value.args[0] == "'not_a_LOCALE_String' is not a valid locale identifier") - assert core.parse_locale('it_IT@euro') == ('it', 'IT', None, None) + assert core.parse_locale('it_IT@euro') == ('it', 'IT', None, None, 'euro') + assert core.parse_locale('it_IT@something') == ('it', 'IT', None, None, 'something') + assert core.parse_locale('en_US.UTF-8') == ('en', 'US', None, None) assert (core.parse_locale('de_DE.iso885915@euro') == - ('de', 'DE', None, None)) + ('de', 'DE', None, None, 'euro')) @pytest.mark.parametrize('filename', [ -- cgit v1.2.1