summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authormartin f. krafft <github.com@pobox.madduck.net>2023-01-26 18:46:06 +0100
committerGitHub <noreply@github.com>2023-01-26 19:46:06 +0200
commitd019ed198d90c5dee2b5dd5f39ffa922263f50e4 (patch)
treebb999836bd4e83b263dbbe0f1c3b51abbdc0a109
parent6bf793ae5b902b3e3f2cb494f2fc230c2641890c (diff)
downloadbabel-d019ed198d90c5dee2b5dd5f39ffa922263f50e4.tar.gz
Keep @modifiers when parsing locales (#947)
Locale modifiers ("@variants") are described in the GNU gettext documentation like this: > The ‘@variant’ can denote any kind of characteristics that is not > already implied by the language ll and the country CC. […] It can also > denote a dialect of the language, … Wherein Babel previously would discard these, this patch stores the modifier information in the `Locale` objects, handling string representation accordingly. Resolves: #946 Signed-off-by: martin f. krafft <madduck@madduck.net> Co-authored-by: Aarni Koskela <akx@iki.fi>
-rw-r--r--babel/core.py115
-rw-r--r--tests/test_core.py6
2 files changed, 88 insertions, 33 deletions
diff --git a/babel/core.py b/babel/core.py
index 6b0c45d..bdd176a 100644
--- a/babel/core.py
+++ b/babel/core.py
@@ -168,6 +168,7 @@ class Locale:
territory: str | None = None,
script: str | None = None,
variant: str | None = None,
+ modifier: str | None = None,
) -> None:
"""Initialize the locale object from the given identifier components.
@@ -181,6 +182,7 @@ class Locale:
:param territory: the territory (country or region) code
:param script: the script code
:param variant: the variant code
+ :param modifier: a modifier (following the '@' symbol, sometimes called '@variant')
:raise `UnknownLocaleError`: if no locale data is available for the
requested locale
"""
@@ -192,10 +194,13 @@ class Locale:
self.script = script
#: the variant code
self.variant = variant
+ #: the modifier
+ self.modifier = modifier
self.__data = None
identifier = str(self)
- if not localedata.exists(identifier):
+ identifier_without_modifier = identifier.partition('@')[0]
+ if not localedata.exists(identifier_without_modifier):
raise UnknownLocaleError(identifier)
@classmethod
@@ -290,6 +295,11 @@ class Locale:
>>> Locale.parse('und_AT')
Locale('de', territory='AT')
+ Modifiers are optional, and always at the end, separated by "@":
+
+ >>> Locale.parse('de_AT@euro')
+ Locale('de', territory='AT', modifier='euro')
+
:param identifier: the locale identifier string
:param sep: optional component separator
:param resolve_likely_subtags: if this is specified then a locale will
@@ -348,7 +358,11 @@ class Locale:
# implement ICU like fuzzy locale objects and provide a way to
# maximize and minimize locale tags.
- language, territory, script, variant = parts
+ if len(parts) == 5:
+ language, territory, script, variant, modifier = parts
+ else:
+ language, territory, script, variant = parts
+ modifier = None
language = get_global('language_aliases').get(language, language)
territory = get_global('territory_aliases').get(territory, (territory,))[0]
script = get_global('script_aliases').get(script, script)
@@ -359,7 +373,7 @@ class Locale:
if script == 'Zzzz':
script = None
- parts = language, territory, script, variant
+ parts = language, territory, script, variant, modifier
# First match: try the whole identifier
new_id = get_locale_identifier(parts)
@@ -373,33 +387,40 @@ class Locale:
# simplified identifier that is just the language
likely_subtag = get_global('likely_subtags').get(language)
if likely_subtag is not None:
- language2, _, script2, variant2 = parse_locale(likely_subtag)
- locale = _try_load_reducing((language2, territory, script2, variant2))
+ parts2 = parse_locale(likely_subtag)
+ if len(parts2) == 5:
+ language2, _, script2, variant2, modifier2 = parse_locale(likely_subtag)
+ else:
+ language2, _, script2, variant2 = parse_locale(likely_subtag)
+ modifier2 = None
+ locale = _try_load_reducing((language2, territory, script2, variant2, modifier2))
if locale is not None:
return locale
raise UnknownLocaleError(input_id)
def __eq__(self, other: object) -> bool:
- for key in ('language', 'territory', 'script', 'variant'):
+ for key in ('language', 'territory', 'script', 'variant', 'modifier'):
if not hasattr(other, key):
return False
return (
self.language == getattr(other, 'language') and # noqa: B009
self.territory == getattr(other, 'territory') and # noqa: B009
self.script == getattr(other, 'script') and # noqa: B009
- self.variant == getattr(other, 'variant') # noqa: B009
+ self.variant == getattr(other, 'variant') and # noqa: B009
+ self.modifier == getattr(other, 'modifier') # noqa: B009
)
def __ne__(self, other: object) -> bool:
return not self.__eq__(other)
def __hash__(self) -> int:
- return hash((self.language, self.territory, self.script, self.variant))
+ return hash((self.language, self.territory, self.script,
+ self.variant, self.modifier))
def __repr__(self) -> str:
parameters = ['']
- for key in ('territory', 'script', 'variant'):
+ for key in ('territory', 'script', 'variant', 'modifier'):
value = getattr(self, key)
if value is not None:
parameters.append(f"{key}={value!r}")
@@ -407,7 +428,8 @@ class Locale:
def __str__(self) -> str:
return get_locale_identifier((self.language, self.territory,
- self.script, self.variant))
+ self.script, self.variant,
+ self.modifier))
@property
def _data(self) -> localedata.LocaleDataDict:
@@ -424,6 +446,11 @@ class Locale:
>>> Locale('zh', 'CN', script='Hans').get_display_name('en')
u'Chinese (Simplified, China)'
+ Modifiers are currently passed through verbatim:
+
+ >>> Locale('it', 'IT', modifier='euro').get_display_name('en')
+ u'Italian (Italy, euro)'
+
:param locale: the locale to use
"""
if locale is None:
@@ -438,6 +465,8 @@ class Locale:
details.append(locale.territories.get(self.territory))
if self.variant:
details.append(locale.variants.get(self.variant))
+ if self.modifier:
+ details.append(self.modifier)
details = filter(None, details)
if details:
retval += f" ({', '.join(details)})"
@@ -1115,9 +1144,12 @@ def negotiate_locale(preferred: Iterable[str], available: Iterable[str], sep: st
return None
-def parse_locale(identifier: str, sep: str = '_') -> tuple[str, str | None, str | None, str | None]:
+def parse_locale(
+ identifier: str,
+ sep: str = '_'
+) -> tuple[str, str | None, str | None, str | None, str | None]:
"""Parse a locale identifier into a tuple of the form ``(language,
- territory, script, variant)``.
+ territory, script, variant, modifier)``.
>>> parse_locale('zh_CN')
('zh', 'CN', None, None)
@@ -1129,12 +1161,22 @@ def parse_locale(identifier: str, sep: str = '_') -> tuple[str, str | None, str
('en', '150', None, None)
>>> parse_locale('en_us_posix')
('en', 'US', None, 'POSIX')
+ >>> parse_locale('it_IT@euro')
+ ('it', 'IT', None, None, 'euro')
+ >>> parse_locale('it_IT@custom')
+ ('it', 'IT', None, None, 'custom')
+ >>> parse_locale('it_IT@')
+ ('it', 'IT', None, None)
The default component separator is "_", but a different separator can be
- specified using the `sep` parameter:
+ specified using the `sep` parameter.
+
+ The optional modifier is always separated with "@" and at the end:
>>> parse_locale('zh-CN', sep='-')
('zh', 'CN', None, None)
+ >>> parse_locale('zh-CN@custom', sep='-')
+ ('zh', 'CN', None, None, 'custom')
If the identifier cannot be parsed into a locale, a `ValueError` exception
is raised:
@@ -1144,14 +1186,13 @@ def parse_locale(identifier: str, sep: str = '_') -> tuple[str, str | None, str
...
ValueError: 'not_a_LOCALE_String' is not a valid locale identifier
- Encoding information and locale modifiers are removed from the identifier:
+ Encoding information is removed from the identifier, while modifiers are
+ kept:
- >>> parse_locale('it_IT@euro')
- ('it', 'IT', None, None)
>>> parse_locale('en_US.UTF-8')
('en', 'US', None, None)
>>> parse_locale('de_DE.iso885915@euro')
- ('de', 'DE', None, None)
+ ('de', 'DE', None, None, 'euro')
See :rfc:`4646` for more information.
@@ -1161,13 +1202,10 @@ def parse_locale(identifier: str, sep: str = '_') -> tuple[str, str | None, str
:raise `ValueError`: if the string does not appear to be a valid locale
identifier
"""
+ identifier, _, modifier = identifier.partition('@')
if '.' in identifier:
# this is probably the charset/encoding, which we don't care about
identifier = identifier.split('.', 1)[0]
- if '@' in identifier:
- # this is a locale modifier such as @euro, which we don't care about
- # either
- identifier = identifier.split('@', 1)[0]
parts = identifier.split(sep)
lang = parts.pop(0).lower()
@@ -1193,22 +1231,37 @@ def parse_locale(identifier: str, sep: str = '_') -> tuple[str, str | None, str
if parts:
raise ValueError(f"{identifier!r} is not a valid locale identifier")
- return lang, territory, script, variant
-
-
-def get_locale_identifier(tup: tuple[str, str | None, str | None, str | None], sep: str = '_') -> str:
+ # TODO(3.0): always return a 5-tuple
+ if modifier:
+ return lang, territory, script, variant, modifier
+ else:
+ return lang, territory, script, variant
+
+
+def get_locale_identifier(
+ tup: tuple[str]
+ | tuple[str, str | None]
+ | tuple[str, str | None, str | None]
+ | tuple[str, str | None, str | None, str | None]
+ | tuple[str, str | None, str | None, str | None, str | None],
+ sep: str = "_",
+) -> str:
"""The reverse of :func:`parse_locale`. It creates a locale identifier out
- of a ``(language, territory, script, variant)`` tuple. Items can be set to
+ of a ``(language, territory, script, variant, modifier)`` tuple. Items can be set to
``None`` and trailing ``None``\\s can also be left out of the tuple.
- >>> get_locale_identifier(('de', 'DE', None, '1999'))
- 'de_DE_1999'
+ >>> get_locale_identifier(('de', 'DE', None, '1999', 'custom'))
+ 'de_DE_1999@custom'
+ >>> get_locale_identifier(('fi', None, None, None, 'custom'))
+ 'fi@custom'
+
.. versionadded:: 1.0
:param tup: the tuple as returned by :func:`parse_locale`.
:param sep: the separator for the identifier.
"""
- tup = tuple(tup[:4])
- lang, territory, script, variant = tup + (None,) * (4 - len(tup))
- return sep.join(filter(None, (lang, script, territory, variant)))
+ tup = tuple(tup[:5])
+ lang, territory, script, variant, modifier = tup + (None,) * (5 - len(tup))
+ ret = sep.join(filter(None, (lang, script, territory, variant)))
+ return f'{ret}@{modifier}' if modifier else ret
diff --git a/tests/test_core.py b/tests/test_core.py
index 2a7e605..0023824 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -283,10 +283,12 @@ def test_parse_locale():
assert (excinfo.value.args[0] ==
"'not_a_LOCALE_String' is not a valid locale identifier")
- assert core.parse_locale('it_IT@euro') == ('it', 'IT', None, None)
+ assert core.parse_locale('it_IT@euro') == ('it', 'IT', None, None, 'euro')
+ assert core.parse_locale('it_IT@something') == ('it', 'IT', None, None, 'something')
+
assert core.parse_locale('en_US.UTF-8') == ('en', 'US', None, None)
assert (core.parse_locale('de_DE.iso885915@euro') ==
- ('de', 'DE', None, None))
+ ('de', 'DE', None, None, 'euro'))
@pytest.mark.parametrize('filename', [