diff options
Diffstat (limited to 'Lib/locale.py')
-rw-r--r-- | Lib/locale.py | 175 |
1 files changed, 120 insertions, 55 deletions
diff --git a/Lib/locale.py b/Lib/locale.py index d2a885d519..7cfea618d7 100644 --- a/Lib/locale.py +++ b/Lib/locale.py @@ -336,6 +336,22 @@ def _test(): # overridden below) _setlocale = setlocale +def _replace_encoding(code, encoding): + if '.' in code: + langname = code[:code.index('.')] + else: + langname = code + # Convert the encoding to a C lib compatible encoding string + norm_encoding = encodings.normalize_encoding(encoding) + #print('norm encoding: %r' % norm_encoding) + norm_encoding = encodings.aliases.aliases.get(norm_encoding, + norm_encoding) + #print('aliased encoding: %r' % norm_encoding) + encoding = locale_encoding_alias.get(norm_encoding, + norm_encoding) + #print('found encoding %r' % encoding) + return langname + '.' + encoding + def normalize(localename): """ Returns a normalized locale code for the given locale @@ -352,55 +368,71 @@ def normalize(localename): does. """ - # Normalize the locale name and extract the encoding - fullname = localename.lower() - if ':' in fullname: + # Normalize the locale name and extract the encoding and modifier + code = localename.lower() + if ':' in code: # ':' is sometimes used as encoding delimiter. - fullname = fullname.replace(':', '.') - if '.' in fullname: - langname, encoding = fullname.split('.')[:2] - fullname = langname + '.' + encoding + code = code.replace(':', '.') + if '@' in code: + code, modifier = code.split('@', 1) else: - langname = fullname + modifier = '' + if '.' in code: + langname, encoding = code.split('.')[:2] + else: + langname = code encoding = '' - # First lookup: fullname (possibly with encoding) - norm_encoding = encoding.replace('-', '') - norm_encoding = norm_encoding.replace('_', '') - lookup_name = langname + '.' + encoding + # First lookup: fullname (possibly with encoding and modifier) + lang_enc = langname + if encoding: + norm_encoding = encoding.replace('-', '') + norm_encoding = norm_encoding.replace('_', '') + lang_enc += '.' + norm_encoding + lookup_name = lang_enc + if modifier: + lookup_name += '@' + modifier code = locale_alias.get(lookup_name, None) if code is not None: return code - #print 'first lookup failed' - - # Second try: langname (without encoding) - code = locale_alias.get(langname, None) - if code is not None: - #print 'langname lookup succeeded' - if '.' in code: - langname, defenc = code.split('.') - else: - langname = code - defenc = '' - if encoding: - # Convert the encoding to a C lib compatible encoding string - norm_encoding = encodings.normalize_encoding(encoding) - #print 'norm encoding: %r' % norm_encoding - norm_encoding = encodings.aliases.aliases.get(norm_encoding, - norm_encoding) - #print 'aliased encoding: %r' % norm_encoding - encoding = locale_encoding_alias.get(norm_encoding, - norm_encoding) - else: - encoding = defenc - #print 'found encoding %r' % encoding - if encoding: - return langname + '.' + encoding - else: - return langname - - else: - return localename + #print('first lookup failed') + + if modifier: + # Second try: fullname without modifier (possibly with encoding) + code = locale_alias.get(lang_enc, None) + if code is not None: + #print('lookup without modifier succeeded') + if '@' not in code: + return code + '@' + modifier + if code.split('@', 1)[1].lower() == modifier: + return code + #print('second lookup failed') + + if encoding: + # Third try: langname (without encoding, possibly with modifier) + lookup_name = langname + if modifier: + lookup_name += '@' + modifier + code = locale_alias.get(lookup_name, None) + if code is not None: + #print('lookup without encoding succeeded') + if '@' not in code: + return _replace_encoding(code, encoding) + code, modifier = code.split('@', 1) + return _replace_encoding(code, encoding) + '@' + modifier + + if modifier: + # Fourth try: langname (without encoding and modifier) + code = locale_alias.get(langname, None) + if code is not None: + #print('lookup without modifier and encoding succeeded') + if '@' not in code: + return _replace_encoding(code, encoding) + '@' + modifier + code, defmod = code.split('@', 1) + if defmod.lower() == modifier: + return _replace_encoding(code, encoding) + '@' + defmod + + return localename def _parse_localename(localename): @@ -419,7 +451,7 @@ def _parse_localename(localename): code = normalize(localename) if '@' in code: # Deal with locale modifiers - code, modifier = code.split('@') + code, modifier = code.split('@', 1) if modifier == 'euro' and '.' not in code: # Assume Latin-9 for @euro locales. This is bogus, # since some systems may use other encodings for these @@ -743,11 +775,30 @@ locale_encoding_alias = { # updated 'sr_yu.utf8@cyrillic' -> 'sr_CS.UTF-8' to 'sr_RS.UTF-8' # updated 'sr_yu@cyrillic' -> 'sr_CS.ISO8859-5' to 'sr_RS.UTF-8' # +# SS 2013-12-20: +# Updated alias mapping to most recent locale.alias file +# from X.org distribution using makelocalealias.py. +# +# These are the differences compared to the old mapping (Python 3.3.3 +# and older): +# +# updated 'a3' -> 'a3_AZ.KOI8-C' to 'az_AZ.KOI8-C' +# updated 'a3_az' -> 'a3_AZ.KOI8-C' to 'az_AZ.KOI8-C' +# updated 'a3_az.koi8c' -> 'a3_AZ.KOI8-C' to 'az_AZ.KOI8-C' +# updated 'cs_cs.iso88592' -> 'cs_CS.ISO8859-2' to 'cs_CZ.ISO8859-2' +# updated 'hebrew' -> 'iw_IL.ISO8859-8' to 'he_IL.ISO8859-8' +# updated 'hebrew.iso88598' -> 'iw_IL.ISO8859-8' to 'he_IL.ISO8859-8' +# updated 'sd' -> 'sd_IN@devanagari.UTF-8' to 'sd_IN.UTF-8' +# updated 'sr@latn' -> 'sr_RS.UTF-8@latin' to 'sr_CS.UTF-8@latin' +# updated 'sr_cs' -> 'sr_RS.UTF-8' to 'sr_CS.UTF-8' +# updated 'sr_cs.utf8@latn' -> 'sr_RS.UTF-8@latin' to 'sr_CS.UTF-8@latin' +# updated 'sr_cs@latn' -> 'sr_RS.UTF-8@latin' to 'sr_CS.UTF-8@latin' locale_alias = { - 'a3': 'a3_AZ.KOI8-C', - 'a3_az': 'a3_AZ.KOI8-C', - 'a3_az.koi8c': 'a3_AZ.KOI8-C', + 'a3': 'az_AZ.KOI8-C', + 'a3_az': 'az_AZ.KOI8-C', + 'a3_az.koi8c': 'az_AZ.KOI8-C', + 'a3_az.koic': 'az_AZ.KOI8-C', 'af': 'af_ZA.ISO8859-1', 'af_za': 'af_ZA.ISO8859-1', 'af_za.iso88591': 'af_ZA.ISO8859-1', @@ -766,6 +817,7 @@ locale_alias = { 'ar_dz.iso88596': 'ar_DZ.ISO8859-6', 'ar_eg': 'ar_EG.ISO8859-6', 'ar_eg.iso88596': 'ar_EG.ISO8859-6', + 'ar_in': 'ar_IN.UTF-8', 'ar_iq': 'ar_IQ.ISO8859-6', 'ar_iq.iso88596': 'ar_IQ.ISO8859-6', 'ar_jo': 'ar_JO.ISO8859-6', @@ -795,6 +847,7 @@ locale_alias = { 'arabic': 'ar_AA.ISO8859-6', 'arabic.iso88596': 'ar_AA.ISO8859-6', 'as': 'as_IN.UTF-8', + 'as_in': 'as_IN.UTF-8', 'az': 'az_AZ.ISO8859-9E', 'az_az': 'az_AZ.ISO8859-9E', 'az_az.iso88599e': 'az_AZ.ISO8859-9E', @@ -812,6 +865,7 @@ locale_alias = { 'bg_bg.koi8r': 'bg_BG.KOI8-R', 'bg_bg.microsoftcp1251': 'bg_BG.CP1251', 'bn_in': 'bn_IN.UTF-8', + 'bo_in': 'bo_IN.UTF-8', 'bokmal': 'nb_NO.ISO8859-1', 'bokm\xe5l': 'nb_NO.ISO8859-1', 'br': 'br_FR.ISO8859-1', @@ -829,6 +883,7 @@ locale_alias = { 'c': 'C', 'c-french': 'fr_CA.ISO8859-1', 'c-french.iso88591': 'fr_CA.ISO8859-1', + 'c.ascii': 'C', 'c.en': 'C', 'c.iso88591': 'en_US.ISO8859-1', 'c_c': 'C', @@ -866,7 +921,7 @@ locale_alias = { 'croatian': 'hr_HR.ISO8859-2', 'cs': 'cs_CZ.ISO8859-2', 'cs_cs': 'cs_CZ.ISO8859-2', - 'cs_cs.iso88592': 'cs_CS.ISO8859-2', + 'cs_cs.iso88592': 'cs_CZ.ISO8859-2', 'cs_cz': 'cs_CZ.ISO8859-2', 'cs_cz.iso88592': 'cs_CZ.ISO8859-2', 'cy': 'cy_GB.ISO8859-1', @@ -1170,12 +1225,13 @@ locale_alias = { 'he_il.cp1255': 'he_IL.CP1255', 'he_il.iso88598': 'he_IL.ISO8859-8', 'he_il.microsoftcp1255': 'he_IL.CP1255', - 'hebrew': 'iw_IL.ISO8859-8', - 'hebrew.iso88598': 'iw_IL.ISO8859-8', + 'hebrew': 'he_IL.ISO8859-8', + 'hebrew.iso88598': 'he_IL.ISO8859-8', 'hi': 'hi_IN.ISCII-DEV', 'hi_in': 'hi_IN.ISCII-DEV', 'hi_in.isciidev': 'hi_IN.ISCII-DEV', 'hne': 'hne_IN.UTF-8', + 'hne_in': 'hne_IN.UTF-8', 'hr': 'hr_HR.ISO8859-2', 'hr_hr': 'hr_HR.ISO8859-2', 'hr_hr.iso88592': 'hr_HR.ISO8859-2', @@ -1263,7 +1319,8 @@ locale_alias = { 'korean': 'ko_KR.eucKR', 'korean.euc': 'ko_KR.eucKR', 'ks': 'ks_IN.UTF-8', - 'ks_in@devanagari': 'ks_IN@devanagari.UTF-8', + 'ks_in': 'ks_IN.UTF-8', + 'ks_in@devanagari': 'ks_IN.UTF-8@devanagari', 'kw': 'kw_GB.ISO8859-1', 'kw_gb': 'kw_GB.ISO8859-1', 'kw_gb.iso88591': 'kw_GB.ISO8859-1', @@ -1287,6 +1344,7 @@ locale_alias = { 'lv_lv.iso885913': 'lv_LV.ISO8859-13', 'lv_lv.iso88594': 'lv_LV.ISO8859-4', 'mai': 'mai_IN.UTF-8', + 'mai_in': 'mai_IN.UTF-8', 'mi': 'mi_NZ.ISO8859-1', 'mi_nz': 'mi_NZ.ISO8859-1', 'mi_nz.iso88591': 'mi_NZ.ISO8859-1', @@ -1296,6 +1354,7 @@ locale_alias = { 'mk_mk.iso88595': 'mk_MK.ISO8859-5', 'mk_mk.microsoftcp1251': 'mk_MK.CP1251', 'ml': 'ml_IN.UTF-8', + 'ml_in': 'ml_IN.UTF-8', 'mr': 'mr_IN.UTF-8', 'mr_in': 'mr_IN.UTF-8', 'ms': 'ms_MY.ISO8859-1', @@ -1310,6 +1369,7 @@ locale_alias = { 'nb_no.iso88591': 'nb_NO.ISO8859-1', 'nb_no.iso885915': 'nb_NO.ISO8859-15', 'nb_no@euro': 'nb_NO.ISO8859-15', + 'ne_np': 'ne_NP.UTF-8', 'nl': 'nl_NL.ISO8859-1', 'nl.iso885915': 'nl_NL.ISO8859-15', 'nl_be': 'nl_BE.ISO8859-1', @@ -1362,6 +1422,7 @@ locale_alias = { 'oc_fr.iso885915': 'oc_FR.ISO8859-15', 'oc_fr@euro': 'oc_FR.ISO8859-15', 'or': 'or_IN.UTF-8', + 'or_in': 'or_IN.UTF-8', 'pa': 'pa_IN.UTF-8', 'pa_in': 'pa_IN.UTF-8', 'pd': 'pd_US.ISO8859-1', @@ -1423,7 +1484,10 @@ locale_alias = { 'rw': 'rw_RW.ISO8859-1', 'rw_rw': 'rw_RW.ISO8859-1', 'rw_rw.iso88591': 'rw_RW.ISO8859-1', - 'sd': 'sd_IN@devanagari.UTF-8', + 'sd': 'sd_IN.UTF-8', + 'sd@devanagari': 'sd_IN.UTF-8@devanagari', + 'sd_in': 'sd_IN.UTF-8', + 'sd_in@devanagari': 'sd_IN.UTF-8@devanagari', 'se_no': 'se_NO.UTF-8', 'serbocroatian': 'sr_RS.UTF-8@latin', 'sh': 'sr_RS.UTF-8@latin', @@ -1457,13 +1521,13 @@ locale_alias = { 'sr': 'sr_RS.UTF-8', 'sr@cyrillic': 'sr_RS.UTF-8', 'sr@latin': 'sr_RS.UTF-8@latin', - 'sr@latn': 'sr_RS.UTF-8@latin', - 'sr_cs': 'sr_RS.UTF-8', + 'sr@latn': 'sr_CS.UTF-8@latin', + 'sr_cs': 'sr_CS.UTF-8', 'sr_cs.iso88592': 'sr_CS.ISO8859-2', 'sr_cs.iso88592@latn': 'sr_CS.ISO8859-2', 'sr_cs.iso88595': 'sr_CS.ISO8859-5', - 'sr_cs.utf8@latn': 'sr_RS.UTF-8@latin', - 'sr_cs@latn': 'sr_RS.UTF-8@latin', + 'sr_cs.utf8@latn': 'sr_CS.UTF-8@latin', + 'sr_cs@latn': 'sr_CS.UTF-8@latin', 'sr_me': 'sr_ME.UTF-8', 'sr_rs': 'sr_RS.UTF-8', 'sr_rs.utf8@latn': 'sr_RS.UTF-8@latin', @@ -1541,6 +1605,7 @@ locale_alias = { 'universal': 'en_US.utf', 'universal.utf8@ucs4': 'en_US.UTF-8', 'ur': 'ur_PK.CP1256', + 'ur_in': 'ur_IN.UTF-8', 'ur_pk': 'ur_PK.CP1256', 'ur_pk.cp1256': 'ur_PK.CP1256', 'ur_pk.microsoftcp1256': 'ur_PK.CP1256', |