diff options
-rw-r--r-- | ext/standard/basic_functions.c | 5 | ||||
-rw-r--r-- | ext/standard/basic_functions.h | 4 | ||||
-rw-r--r-- | ext/standard/html.c | 1133 | ||||
-rw-r--r-- | ext/standard/html_tables.h | 2080 | ||||
-rw-r--r-- | ext/standard/tests/strings/get_html_translation_table_basic1.phpt | 8 | ||||
-rw-r--r-- | ext/standard/tests/strings/html_entity_decode_cp866.phpt | 533 | ||||
-rw-r--r-- | ext/standard/tests/strings/html_entity_decode_iso8859-15.phpt | 405 | ||||
-rw-r--r-- | ext/standard/tests/strings/html_entity_decode_iso8859-5.phpt | 405 | ||||
-rw-r--r-- | ext/standard/tests/strings/html_entity_decode_koi8-r.phpt | 533 | ||||
-rw-r--r-- | ext/standard/tests/strings/html_entity_decode_macroman.phpt | 540 | ||||
-rw-r--r-- | ext/standard/tests/strings/html_entity_decode_win1251.phpt | 537 | ||||
-rw-r--r-- | ext/standard/tests/strings/html_entity_decode_win1252.phpt | 169 | ||||
-rw-r--r-- | ext/standard/tests/strings/htmlentities17.phpt | 3 |
13 files changed, 5690 insertions, 665 deletions
diff --git a/ext/standard/basic_functions.c b/ext/standard/basic_functions.c index 96201955be..a70a5b222b 100644 --- a/ext/standard/basic_functions.c +++ b/ext/standard/basic_functions.c @@ -3432,6 +3432,7 @@ static void basic_globals_ctor(php_basic_globals *basic_globals_p TSRMLS_DC) /* BG(left) = -1; BG(user_tick_functions) = NULL; BG(user_filter_map) = NULL; + BG(inverse_ent_maps) = NULL; memset(&BG(serialize), 0, sizeof(BG(serialize))); memset(&BG(unserialize), 0, sizeof(BG(unserialize))); @@ -3454,6 +3455,10 @@ static void basic_globals_dtor(php_basic_globals *basic_globals_p TSRMLS_DC) /* zend_hash_destroy(BG(url_adapt_state_ex).tags); free(BG(url_adapt_state_ex).tags); } + if (BG(inverse_ent_maps)) { + zend_hash_destroy(BG(inverse_ent_maps)); + pefree(BG(inverse_ent_maps), 1); + } } /* }}} */ diff --git a/ext/standard/basic_functions.h b/ext/standard/basic_functions.h index 4498e6cf8f..edc5846e0a 100644 --- a/ext/standard/basic_functions.h +++ b/ext/standard/basic_functions.h @@ -220,6 +220,10 @@ typedef struct _php_basic_globals { HashTable *user_filter_map; + /* html.c */ + /* map entities to characters. 
Stores hash table pointers for each charset */ + HashTable *inverse_ent_maps; + /* file.c */ #if defined(_REENTRANT) && defined(HAVE_MBRLEN) && defined(HAVE_MBSTATE_T) mbstate_t mblen_state; diff --git a/ext/standard/html.c b/ext/standard/html.c index 7a14f6b0ad..0ad34e52c4 100644 --- a/ext/standard/html.c +++ b/ext/standard/html.c @@ -14,7 +14,8 @@ +----------------------------------------------------------------------+ | Authors: Rasmus Lerdorf <rasmus@php.net> | | Jaakko Hyvätti <jaakko.hyvatti@iki.fi> | - | Wez Furlong <wez@thebrainroom.com> | + | Wez Furlong <wez@thebrainroom.com> | + | Gustavo Lopes <cataphract@php.net> | +----------------------------------------------------------------------+ */ @@ -28,7 +29,11 @@ * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT * * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2 - * + * + * From HTML 4.01 strict DTD: + * http://www.w3.org/TR/html4/HTMLlat1.ent + * http://www.w3.org/TR/html4/HTMLsymbol.ent + * http://www.w3.org/TR/html4/HTMLspecial.ent */ #include "php.h" @@ -37,7 +42,7 @@ #else #include <php_config.h> #endif -#include "html.h" +#include "php_standard.h" #include "php_string.h" #include "SAPI.h" #if HAVE_LOCALE_H @@ -52,424 +57,8 @@ ZEND_EXTERN_MODULE_GLOBALS(mbstring) #endif -enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, - cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, - cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r, - cs_cp1251, cs_8859_5, cs_cp866, cs_macroman - }; -typedef const char *const entity_table_t; - -/* codepage 1252 is a Windows extension to iso-8859-1. 
*/ -static entity_table_t ent_cp_1252[] = { - "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger", - "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig", - NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo", - "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo", - "oelig", NULL, NULL, "Yuml" -}; - -static entity_table_t ent_iso_8859_1[] = { - "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", - "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg", - "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro", - "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14", - "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc", - "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", - "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", - "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", - "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", - "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", - "atilde", "auml", "aring", "aelig", "ccedil", "egrave", - "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", - "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", - "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", - "uuml", "yacute", "thorn", "yuml" -}; - -static entity_table_t ent_iso_8859_15[] = { - "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron", - "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg", - "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */ - "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm", - "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute", - "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", - "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", - "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", - "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", - "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", - "atilde", "auml", "aring", "aelig", 
"ccedil", "egrave", - "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", - "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", - "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", - "uuml", "yacute", "thorn", "yuml" -}; - -static entity_table_t ent_uni_338_402[] = { - /* 338 (0x0152) */ - "OElig", "oelig", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 352 (0x0160) */ - "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 376 (0x0178) */ - "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 400 (0x0190) */ - NULL, NULL, "fnof" -}; - -static entity_table_t ent_uni_spacing[] = { - /* 710 */ - "circ", - /* 711 - 730 */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 731 - 732 */ - NULL, "tilde" -}; - -static entity_table_t ent_uni_greek[] = { - /* 913 */ - "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta", - "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho", - NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega", - /* 938 - 944 are not mapped */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", - "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", - "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega", - /* 970 - 976 are not mapped */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - "thetasym", "upsih", - NULL, NULL, NULL, - "piv" -}; - -static entity_table_t ent_uni_punct[] = { - /* 8194 */ - "ensp", "emsp", NULL, NULL, NULL, NULL, NULL, - "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm", - NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL, - /* 8216 */ - "lsquo", "rsquo", "sbquo", 
NULL, "ldquo", "rdquo", "bdquo", NULL, - "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL, - /* 8242 */ - "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL, - NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL, - "frasl" -}; - -static entity_table_t ent_uni_euro[] = { - "euro" -}; - -static entity_table_t ent_uni_8465_8501[] = { - /* 8465 */ - "image", NULL, NULL, NULL, NULL, NULL, NULL, - /* 8472 */ - "weierp", NULL, NULL, NULL, - /* 8476 */ - "real", NULL, NULL, NULL, NULL, NULL, - /* 8482 */ - "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8501 */ - "alefsym", -}; - -static entity_table_t ent_uni_8592_9002[] = { - /* 8592 (0x2190) */ - "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8608 (0x21a0) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8624 (0x21b0) */ - NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8640 (0x21c0) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8656 (0x21d0) */ - "lArr", "uArr", "rArr", "dArr", "hArr", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8672 (0x21e0) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8704 (0x2200) */ - "forall", NULL, "part", "exist", NULL, "empty", NULL, "nabla", - "isin", "notin", NULL, "ni", NULL, NULL, NULL, "prod", - /* 8720 (0x2210) */ - NULL, "sum", "minus", NULL, NULL, NULL, NULL, "lowast", - NULL, NULL, "radic", NULL, NULL, "prop", "infin", NULL, - /* 8736 (0x2220) */ - "ang", NULL, NULL, NULL, NULL, NULL, 
NULL, "and", - "or", "cap", "cup", "int", NULL, NULL, NULL, NULL, - /* 8752 (0x2230) */ - NULL, NULL, NULL, NULL, "there4", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, "sim", NULL, NULL, NULL, - /* 8768 (0x2240) */ - NULL, NULL, NULL, NULL, NULL, "cong", NULL, NULL, - "asymp", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8784 (0x2250) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8800 (0x2260) */ - "ne", "equiv", NULL, NULL, "le", "ge", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8816 (0x2270) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8832 (0x2280) */ - NULL, NULL, "sub", "sup", "nsub", NULL, "sube", "supe", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8848 (0x2290) */ - NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8864 (0x22a0) */ - NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8880 (0x22b0) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8896 (0x22c0) */ - NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8912 (0x22d0) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8928 (0x22e0) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8944 (0x22f0) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8960 (0x2300) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL, - /* 8976 (0x2310) */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - /* 8992 (0x2320) */ - NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, - NULL, "lang", "rang" -}; - -static entity_table_t ent_uni_9674[] = { - /* 9674 */ - "loz" -}; - -static entity_table_t ent_uni_9824_9830[] = { - /* 9824 */ - "spades", NULL, NULL, "clubs", NULL, "hearts", "diams" -}; - -static entity_table_t ent_koi8r[] = { - "#1105", /* "jo "*/ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092", - "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084", - "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090", - "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096", - "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041", - "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048", - "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055", - "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042", - "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063", - "#1066" -}; - -static entity_table_t ent_cp_1251[] = { - "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger", - "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036", - "#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220", - "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250", - "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118", - "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy", - "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn", - "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105", - "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111", - "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046", - "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053", - "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060", - "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067", - "#1068", "#1069", 
"#1070", "#1071", "#1072", "#1073", "#1074", - "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081", - "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088", - "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095", - "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102", - "#1103" -}; - -static entity_table_t ent_iso_8859_5[] = { - "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062", - "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069", - "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076", - "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083", - "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090", - "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", - "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104", - "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111", - "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118", - "#1119" -}; - -static entity_table_t ent_cp_866[] = { - - "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566", - "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552", - "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560", - "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608", - "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090", - "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", - "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025", - "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118", - "#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632", - "#160" -}; - -/* MacRoman has a couple of low-ascii chars that need mapping too */ -/* Vertical tab (ASCII 11) is often used to store line breaks inside */ -/* DB exports, this mapping changes it to a space */ -static entity_table_t ent_macroman[] = { - "sp", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, "quot", 
NULL, - NULL, NULL, "amp", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "lt", NULL, "gt", NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml", - "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring", - "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave", - "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml", - "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg", - "cent", "pound", "sect", "bull", "para", "szlig", "reg", - "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash", - "infin", "plusmn", "le", "ge", "yen", "micro", "part", - "sum", "prod", "pi", "int", "ordf", "ordm", "Omega", - "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof", - "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave", - "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo", - "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml", - "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger", - "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute", - "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute", - "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305", - "circ", "tilde", "macr", "#728", "#729", "#730", "cedil", - "#733", "#731", "#711" -}; - -struct html_entity_map { - enum entity_charset charset; /* charset identifier */ - unsigned int basechar; /* char code at start of table */ - unsigned int endchar; /* last char code in the table */ - entity_table_t *table; /* the table 
of mappings */ -}; - -static const struct html_entity_map entity_map[] = { - { cs_cp1252, 0x80, 0x9f, ent_cp_1252 }, - { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 }, - { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_utf_8, 338, 402, ent_uni_338_402 }, - { cs_utf_8, 710, 732, ent_uni_spacing }, - { cs_utf_8, 913, 982, ent_uni_greek }, - { cs_utf_8, 8194, 8260, ent_uni_punct }, - { cs_utf_8, 8364, 8364, ent_uni_euro }, - { cs_utf_8, 8465, 8501, ent_uni_8465_8501 }, - { cs_utf_8, 8592, 9002, ent_uni_8592_9002 }, - { cs_utf_8, 9674, 9674, ent_uni_9674 }, - { cs_utf_8, 9824, 9830, ent_uni_9824_9830 }, - { cs_big5, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 }, - { cs_koi8r, 0xa3, 0xff, ent_koi8r }, - { cs_cp1251, 0x80, 0xff, ent_cp_1251 }, - { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 }, - { cs_cp866, 0xc0, 0xff, ent_cp_866 }, - { cs_macroman, 0x0b, 0xff, ent_macroman }, - { cs_terminator } -}; - -static const struct { - const char *codeset; - enum entity_charset charset; -} charset_map[] = { - { "ISO-8859-1", cs_8859_1 }, - { "ISO8859-1", cs_8859_1 }, - { "ISO-8859-15", cs_8859_15 }, - { "ISO8859-15", cs_8859_15 }, - { "utf-8", cs_utf_8 }, - { "cp1252", cs_cp1252 }, - { "Windows-1252", cs_cp1252 }, - { "1252", cs_cp1252 }, - { "BIG5", cs_big5 }, - { "950", cs_big5 }, - { "GB2312", cs_gb2312 }, - { "936", cs_gb2312 }, - { "BIG5-HKSCS", cs_big5hkscs }, - { "Shift_JIS", cs_sjis }, - { "SJIS", cs_sjis }, - { "932", cs_sjis }, - { "EUCJP", cs_eucjp }, - { "EUC-JP", cs_eucjp }, - { "KOI8-R", cs_koi8r }, - { "koi8-ru", cs_koi8r }, - { "koi8r", cs_koi8r }, - { "cp1251", cs_cp1251 }, - { "Windows-1251", cs_cp1251 }, - { "win-1251", cs_cp1251 }, - { "iso8859-5", cs_8859_5 }, - { "iso-8859-5", cs_8859_5 }, - { "cp866", cs_cp866 }, - { 
"866", cs_cp866 }, - { "ibm866", cs_cp866 }, - { "MacRoman", cs_macroman }, - { NULL } -}; - -static const struct { - unsigned short charcode; - char *entity; - int entitylen; - int flags; -} basic_entities[] = { - { '"', """, 6, ENT_HTML_QUOTE_DOUBLE }, - { '\'', "'", 6, ENT_HTML_QUOTE_SINGLE }, - { '\'', "'", 5, ENT_HTML_QUOTE_SINGLE }, - { '<', "<", 4, 0 }, - { '>', ">", 4, 0 }, - { 0, NULL, 0, 0 } -}; - -struct basic_entities_dec { - unsigned short charcode; - char entity[8]; - int entitylen; -}; - +#include "html_tables.h" + #define MB_RETURN { \ *newpos = pos; \ mbseq[mbpos] = '\0'; \ @@ -871,6 +460,8 @@ size_t php_utf32_utf8(unsigned char *buf, int k) { size_t retval = 0; + /* assert(0x0 <= k <= 0x10FFFF); */ + if (k < 0x80) { buf[0] = k; retval = 1; @@ -883,226 +474,492 @@ size_t php_utf32_utf8(unsigned char *buf, int k) buf[1] = 0x80 | ((k >> 6) & 0x3f); buf[2] = 0x80 | (k & 0x3f); retval = 3; - } else if (k < 0x200000) { + } else { buf[0] = 0xf0 | (k >> 18); buf[1] = 0x80 | ((k >> 12) & 0x3f); buf[2] = 0x80 | ((k >> 6) & 0x3f); buf[3] = 0x80 | (k & 0x3f); retval = 4; - } else if (k < 0x4000000) { - buf[0] = 0xf8 | (k >> 24); - buf[1] = 0x80 | ((k >> 18) & 0x3f); - buf[2] = 0x80 | ((k >> 12) & 0x3f); - buf[3] = 0x80 | ((k >> 6) & 0x3f); - buf[4] = 0x80 | (k & 0x3f); - retval = 5; - } else { - buf[0] = 0xfc | (k >> 30); - buf[1] = 0x80 | ((k >> 24) & 0x3f); - buf[2] = 0x80 | ((k >> 18) & 0x3f); - buf[3] = 0x80 | ((k >> 12) & 0x3f); - buf[4] = 0x80 | ((k >> 6) & 0x3f); - buf[5] = 0x80 | (k & 0x3f); - retval = 6; } - buf[retval] = '\0'; + /* UTF-8 has been restricted to max 4 bytes since RFC 3629 */ return retval; } /* }}} */ -/* {{{ php_unescape_html_entities +/* {{{ unimap_bsearc_cmp + * Binary search of unicode code points in unicode <--> charset mapping. + * Returns the code point in the target charset (whose mapping table was given) or 0 if + * the unicode code point is not in the table. 
*/ -PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC) +static unsigned char unimap_bsearch(const unicode_mapping *table, unsigned code_key_a, size_t num) { - int retlen, j; - unsigned int k; - char *replaced, *ret, *p, *q, *lim, *next; - enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC); - unsigned char replacement[15]; - int replacement_len; + const unicode_mapping *l = table, + *h = &table[num-1], + *m; + unsigned short code_key; + + /* we have no mappings outside the BMP */ + if (code_key_a > 0xFFFFU) + return 0; + + code_key = (unsigned short) code_key_a; + + while (l <= h) { + m = l + (h - l) / 2; + if (code_key < m->un_code_point) + h = m - 1; + else if (code_key > m->un_code_point) + l = m + 1; + else + return m->cs_code; + } + return 0; +} +/* }}} */ - ret = estrndup(old, oldlen); - retlen = oldlen; - if (!retlen) { - goto empty_source; +/* {{{ map_from_unicode */ +static int map_from_unicode(unsigned code, enum entity_charset charset, unsigned *res) +{ + unsigned char found; + const unicode_mapping *table; + size_t table_size; + + switch (charset) { + case cs_8859_1: + /* identity mapping of code points to unicode */ + if (code > 0xFF) { + return FAILURE; + } + *res = code; + break; + + case cs_8859_5: + if (code <= 0xA0 || code == 0xAD /* soft hyphen */) { + *res = code; + } else if (code == 0x2116) { + *res = 0xF0; /* numero sign */ + } else if (code == 0xA7) { + *res = 0xFD; /* section sign */ + } else if (code >= 0x0401 && code <= 0x044F) { + if (code == 0x040D || code == 0x0450 || code == 0x045D) + return FAILURE; + *res = code - 0x360; + } else { + return FAILURE; + } + break; + + case cs_8859_15: + if (code < 0xA4 || (code > 0xBE && code <= 0xFF)) { + *res = code; + } else { /* between A4 and 0xBE */ + found = unimap_bsearch(unimap_iso885915, + code, sizeof(unimap_iso885915) / sizeof(*unimap_iso885915)); + if (found) + *res = found; + 
else + return FAILURE; + } + break; + + case cs_cp1252: + if (code <= 0x7F || (code >= 0xA0 && code <= 0xFF)) { + *res = code; + } else { + found = unimap_bsearch(unimap_win1252, + code, sizeof(unimap_win1252) / sizeof(*unimap_win1252)); + if (found) + *res = found; + else + return FAILURE; + } + break; + + case cs_macroman: + if (code == 0x7F) + return FAILURE; + table = unimap_macroman; + table_size = sizeof(unimap_macroman) / sizeof(*unimap_macroman); + goto table_over_7F; + case cs_cp1251: + table = unimap_win1251; + table_size = sizeof(unimap_win1251) / sizeof(*unimap_win1251); + goto table_over_7F; + case cs_koi8r: + table = unimap_koi8r; + table_size = sizeof(unimap_koi8r) / sizeof(*unimap_koi8r); + goto table_over_7F; + case cs_cp866: + table = unimap_cp866; + table_size = sizeof(unimap_cp866) / sizeof(*unimap_cp866); + +table_over_7F: + if (code <= 0x7F) { + *res = code; + } else { + found = unimap_bsearch(table, code, table_size); + if (found) + *res = found; + else + return FAILURE; + } + break; + + /* from here on, only map the possible characters in the ASCII range. + * to improve support here, it's a matter of building the unicode mappings. 
+ * See <http://www.unicode.org/Public/6.0.0/ucd/Unihan.zip> */ + case cs_sjis: + case cs_eucjp: + if (code >= 0x20 && code <= 0x7D) { + if (code == 0x5C) /* 0x5C is mapped to the yen symbol */ + return FAILURE; + *res = code; + } else { + return FAILURE; + } + break; + + case cs_big5: + case cs_big5hkscs: + case cs_gb2312: + if (code >= 0x20 && code <= 0x7D) { + *res = code; + } else { + return FAILURE; + } + break; + + default: + return FAILURE; } - - if (all) { - /* look for a match in the maps for this charset */ - for (j = 0; entity_map[j].charset != cs_terminator; j++) { - if (entity_map[j].charset != charset) - continue; - for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) { - unsigned char entity[32]; - int entity_length = 0; + return SUCCESS; +} +/* }}} */ - if (entity_map[j].table[k - entity_map[j].basechar] == NULL) - continue; +/* {{{ process_numeric_entity + * Auxiliary function to traverse_for_entities. + * On input, *buf should point to the first character after # and on output, it's the last + * byte read, no matter if there was success or insuccess. 
+ */ +static int process_numeric_entity(char **buf, unsigned *code_point, int all) +{ + long code_l; + int hexadecimal = (**buf == 'x' || **buf == 'X'); - entity_length = slprintf(entity, sizeof(entity), "&%s;", entity_map[j].table[k - entity_map[j].basechar]); - if (entity_length >= sizeof(entity)) { - continue; - } + if (hexadecimal) + (*buf)++; + + /* strtol allows whitespace and other stuff in the beginning + * we're not interested */ + if (hexadecimal && !isxdigit(**buf) || + !hexadecimal && !isdigit(**buf)) { + return FAILURE; + } - /* When we have MBCS entities in the tables above, this will need to handle it */ - replacement_len = 0; - switch (charset) { - case cs_8859_1: - case cs_cp1252: - case cs_8859_15: - case cs_cp1251: - case cs_8859_5: - case cs_cp866: - case cs_koi8r: - replacement[0] = k; - replacement[1] = '\0'; - replacement_len = 1; - break; + code_l = strtol(*buf, buf, hexadecimal ? 16 : 10); - case cs_big5: - case cs_gb2312: - case cs_big5hkscs: - case cs_sjis: - case cs_eucjp: - /* we cannot properly handle those multibyte encodings - * with php_str_to_str. skip it. */ - continue; + if (**buf != ';') + return FAILURE; - case cs_utf_8: - replacement_len = php_utf32_utf8(replacement, k); - break; + /* many more are invalid, but that depends on whether it's HTML + * (and which version) or XML. 
Rejecting 0 is handy because that's + * the return of strtol if no character was read */ + if (code_l <= 0L || code_l > 0x10FFFFL) + return FAILURE; + + *code_point = (unsigned)code_l; - default: - php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!"); - efree(ret); - return NULL; - } + if (!all) { + if (*code_point != '\'' && *code_point != '"') + return FAILURE; + } - if (php_memnstr(ret, entity, entity_length, ret+retlen)) { - replaced = php_str_to_str(ret, retlen, entity, entity_length, replacement, replacement_len, &retlen); - efree(ret); - ret = replaced; - } - } - } + return SUCCESS; +} +/* }}} */ + +/* {{{ process_named_entity */ +static int process_named_entity(char **buf, unsigned *code_unit_seq, HashTable *inv_map) +{ + size_t length; + char *start = *buf; + unsigned *stored_code; + + /* "&" is represented by a 0x26 in all supported encodings. That means + * the byte after represents a character or is the leading byte of an + * sequence of 8-bit code units. If in the ranges below, it represents + * necessarily a alpha character because none of the supported encodings + * has an overlap with ASCII in the leading byte (only on the second one) */ + while (**buf >= 'a' && **buf <= 'z' || + **buf >= 'A' && **buf <= 'Z' || + **buf >= '0' && **buf <= '9') { + (*buf)++; } - for (j = 0; basic_entities[j].charcode != 0; j++) { + if (**buf != ';') + return FAILURE; + + /* cast to size_t OK as the quantity is always non-negative */ + length = *buf - start; + if (length == 0 || length > 31) /* 31 is arbitrary */ + return FAILURE; + + if (zend_hash_find(inv_map, start, (uint)length, (void**)&stored_code) == FAILURE) + return FAILURE; + + *code_unit_seq = *stored_code; + + return SUCCESS; +} +/* }}} */ + +/* {{{ traverse_for_entities + * Auxiliary function to php_unescape_html_entities(). + * - The argument "all" determines if all numeric entities are decode or only those + * that correspond to quotes (depending on quote_style). 
Typically used with the inv_map + * stored under the key 0 in BG(inverse_ent_maps). + * - Using cs_terminator as charset is legal and has the effect of defaulting to UTF-8. Used + * when the encoding doesn't (or shouldn't...) matter. + */ +static void traverse_for_entities(char *ret, int *retlen_p, int all, int quote_style, HashTable *inv_map, enum entity_charset charset) +{ + int retlen; + char *p, *q, *lim; + + /* note: this function assumes the entities always take equal or more space + * than the characters they represent in whatever supported external encoding. + * The supported encoding that can generate the longest code unit sequences is + * UTF-8 (4 bytes). Theoretically, there could be entities with only 3 chars + * (e.g. &z;) that would map to outside-the-BMP unicode code points and hence + * needed 4 bytes and would overflow, but we have no such thing. */ + + if (charset == cs_terminator) /* caller doesn't care; we choose one */ + charset = cs_utf_8; + + retlen = *retlen_p; - if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0) + lim = ret + retlen; /* terminator address */ + assert(*lim == '\0'); + + for (p = ret, q = ret; p < lim;) { + unsigned code; + char *next = NULL; + /* code is unicode code point or a set of 8-bit code units packed into + * an integer with the least significant bit being the last byte? */ + int unicode; + + /* Shift JIS, Big5 and HKSCS use multi-byte encodings where an + * ASCII range byte can be part of a multi-byte sequence. + * However, they start at 0x40, therefore if we find a 0x26 byte, + * we're sure it represents the '&' character. 
*/ + + /* assumes there are no single-char entities */ + if (p[0] != '&' || (p + 3 >= lim)) { + *(q++) = *(p++); continue; + } + + /* now p[3] is surely valid and is no terminator */ + + /* numerical entity */ + if (p[1] == '#') { + next = &p[2]; + if (process_numeric_entity(&next, &code, all) == FAILURE) + goto invalid_code; + unicode = 1; + } else if (inv_map != NULL) { + next = &p[1]; + if (process_named_entity(&next, &code, inv_map) == FAILURE) + goto invalid_code; + unicode = 0; + } else { + goto invalid_code; + } - replacement[0] = (unsigned char)basic_entities[j].charcode; - replacement[1] = '\0'; + assert(*next == ';'); + + if (code == '\'' && !(quote_style & ENT_HTML_QUOTE_SINGLE) || + code == '"' && !(quote_style & ENT_HTML_QUOTE_DOUBLE)) + goto invalid_code; + + if (unicode && charset != cs_utf_8) { + /* replace unicode code point */ + if (map_from_unicode(code, charset, &code) == FAILURE) + goto invalid_code; /* not representable in target charset */ + } + + switch (charset) { + case cs_utf_8: + { + size_t written; + written = php_utf32_utf8((unsigned char*)q, code); + q += written; + /* Since we're writing in place, we hope we didn't write more than we read */ + assert(written <= (size_t)(next - p) + 1); + break; + } + + case cs_8859_1: + case cs_cp1252: + case cs_8859_15: + case cs_koi8r: + case cs_cp1251: + case cs_8859_5: + case cs_cp866: + case cs_macroman: + /* single byte stuff */ + *(q++) = code; + break; + + case cs_big5: + case cs_big5hkscs: + case cs_sjis: + case cs_gb2312: + /* one or two bytes */ + *(q++) = (code & 0xFFU); + if (0xFF00U & code) { /* 2 */ + *(q++) = (code >> 8); + } + break; + + case cs_eucjp: + /* one to three bytes */ + *(q++) = code & 0xFFU; + if (0xFFFF00U & code) { /* 2 */ + *(q++) = ((code >> 8) & 0xFFU); + if (0xFF0000U & code) /* 3 */ + *(q++) = (code >> 16); + } + break; + + default: + /* for backwards compatilibity */ + goto invalid_code; + break; + } + + /* jump over the valid entity; may go beyond size of buffer; 
np */ + p = next + 1; + continue; - if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) { - replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, replacement, 1, &retlen); - efree(ret); - ret = replaced; +invalid_code: + for (; p < next; p++) { + *(q++) = *p; } } + + *q = '\0'; + *retlen_p = (size_t)(q - ret); +} +/* }}} */ - /* replace numeric entities & "&" */ - lim = ret + retlen; - for (p = ret, q = ret; p < lim;) { - int code; +/* {{{ inv_ent_maps_dtor + * Hash table destructor for BG(inverse_ent_maps) + */ +static void inv_ent_maps_dtor(HashTable **ht) { + zend_hash_destroy(*ht); + pefree(*ht, 1); +} +/* }}} */ - if (p[0] == '&') { - if (p + 2 < lim) { - if (p[1] == '#') { - int invalid_code = 0; +/* {{{ unescape_inverse_map + * Auxiliary function to php_unescape_html_entities() + * charset can be cs_terminator for only basic entities. + */ +static HashTable *unescape_inverse_map(enum entity_charset charset TSRMLS_DC) +{ + HashTable **inverse_map; - if (p[2] == 'x' || p[2] == 'X') { - code = strtol(p + 3, &next, 16); - } else { - code = strtol(p + 2, &next, 10); - } + /* we accept charset = cs_terminator (for specialchars) */ - if (code == '\'' && !(quote_style & ENT_HTML_QUOTE_SINGLE) || - code == '"' && !(quote_style & ENT_HTML_QUOTE_DOUBLE)) { - invalid_code = 1; - } + if (!BG(inverse_ent_maps)) { + BG(inverse_ent_maps) = pemalloc(sizeof *BG(inverse_ent_maps), 1); + zend_hash_init(BG(inverse_ent_maps), cs_numelems, NULL, (dtor_func_t)inv_ent_maps_dtor, 1); + } + if (zend_hash_index_find(BG(inverse_ent_maps), (ulong)charset, (void**)&inverse_map) == FAILURE) { + HashTable *ht = pemalloc(sizeof *ht, 1); + uint capacity = 0; + int j, t; - if (next != NULL && *next == ';' && !invalid_code) { - switch (charset) { - case cs_utf_8: - q += php_utf32_utf8(q, code); - break; - - case cs_8859_1: - case cs_8859_5: - case cs_8859_15: - if ((code >= 0x80 && code < 0xa0) || code > 0xff) { - 
invalid_code = 1; - } else { - *(q++) = code; - } - break; + /* determine upper bound for capacity of hashtable */ + for (j = 0; entity_map[j].charset != cs_terminator; j++) { + if (entity_map[j].charset == charset) + capacity += entity_map[j].endchar - entity_map[j].basechar + 1; + } - case cs_cp1252: - if (code > 0xff) { - invalid_code = 1; - } else { - *(q++) = code; - } - break; - - case cs_cp1251: - case cs_cp866: - case cs_big5: - case cs_big5hkscs: - case cs_sjis: - case cs_eucjp: - if (code >= 0x80) { - invalid_code = 1; - } else { - *(q++) = code; - } - break; + /* no destructor as we'll be storing ints */ + zend_hash_init(ht, capacity, NULL, NULL, 1); - case cs_gb2312: - if (code >= 0x81) { - invalid_code = 1; - } else { - *(q++) = code; - } - break; + /* store new hash table */ + t = zend_hash_index_update(BG(inverse_ent_maps), (ulong)charset, &ht, sizeof(ht), (void**)&inverse_map); + assert(t == SUCCESS); - default: - /* for backwards compatilibity */ - invalid_code = 1; - break; - } - if (invalid_code) { - for (; p <= next; p++) { - *(q++) = *p; - } - } - p = next + 1; - } else { - *(q++) = *(p++); - *(q++) = *(p++); - } - } else if (p + 4 < lim && - p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' && - p[4] == ';') { - *(q++) = '&'; - p += 5; - } else { - *(q++) = *(p++); - *(q++) = *(p++); - } - } else { - *(q++) = *(p++); + /* build inverse map */ + for (j = 0; entity_map[j].charset != cs_terminator; j++) { + unsigned k; + + if (entity_map[j].charset != charset) + continue; + + for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) { + unsigned table_offset = k - entity_map[j].basechar; + const char* entity_name = entity_map[j].table[table_offset]; + + if (entity_name == NULL || *entity_name == '#') + continue; + + t = zend_hash_update(ht, entity_name, strlen(entity_name), &k, sizeof(k), NULL); + assert(t == SUCCESS); } - } else { - *(q++) = *(p++); + } + + /* and add the basic entitites */ + for (j = 0; basic_entities_ex[j].charcode != 0; j++) { 
+ const basic_entity_t *ent = &basic_entities_ex[j]; + unsigned k = ent->charcode; + + t = zend_hash_update(ht, &ent->entity[1] /* skip & */, + ent->entitylen - 2 /* skip & and ; */, &k, sizeof(k), NULL); + assert(t == SUCCESS); } } - *q = '\0'; - retlen = (size_t)(q - ret); + + return *inverse_map; +} + +/* {{{ php_unescape_html_entities + * The parameter "all" should be true to decode all possible entities, false to decode + * only the basic ones, i.e., those in basic_entities_ex + the numeric entities + * that correspond to quotes. + */ +PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC) +{ + int retlen; + char *ret; + enum entity_charset charset; + HashTable *inverse_map = NULL; + + if (all) { + charset = determine_charset(hint_charset TSRMLS_CC); + } else { + charset = cs_terminator; + } + + ret = estrndup(old, oldlen); + retlen = oldlen; + if (retlen == 0) { + goto empty_source; + } + + /* charset == cs_terminator if !all */ + inverse_map = unescape_inverse_map(charset TSRMLS_CC); + + /* replace numeric entities */ + /* !all implies charset == cs_terminator && inverse_map == BG(inverse_ent_maps)[0] */ + traverse_for_entities(ret, &retlen, all, quote_style, inverse_map, charset); + empty_source: *newlen = retlen; return ret; @@ -1315,65 +1172,20 @@ PHP_FUNCTION(htmlspecialchars) Convert special HTML entities back to characters */ PHP_FUNCTION(htmlspecialchars_decode) { - char *str, *new_str, *e, *p; - int len, j, i, new_len; + char *str; + int str_len, len; long quote_style = ENT_COMPAT; - struct basic_entities_dec basic_entities_dec[8]; + char *replaced; - if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, &quote_style) == FAILURE) { + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len, &quote_style) == FAILURE) { return; } - new_str = estrndup(str, len); - new_len = len; - e = new_str + new_len; - - if (!(p = memchr(new_str, '&', 
new_len))) { - RETURN_STRINGL(new_str, new_len, 0); - } - - for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) { - if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) { - continue; - } - basic_entities_dec[j].charcode = basic_entities[i].charcode; - memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1); - basic_entities_dec[j].entitylen = basic_entities[i].entitylen; - j++; + replaced = php_unescape_html_entities(str, str_len, &len, 0 /*!all*/, quote_style, NULL TSRMLS_CC); + if (replaced) { + RETURN_STRINGL(replaced, len, 0); } - basic_entities_dec[j].charcode = '&'; - basic_entities_dec[j].entitylen = sizeof("&amp;") - 1; - memcpy(basic_entities_dec[j].entity, "&amp;", sizeof("&amp;")); - i = j + 1; - - do { - int l = e - p; - - for (j = 0; j < i; j++) { - if (basic_entities_dec[j].entitylen > l) { - continue; - } - if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) { - int e_len = basic_entities_dec[j].entitylen - 1; - - *p++ = (char) basic_entities_dec[j].charcode; - memmove(p, p + e_len, (e - p - e_len)); - e -= e_len; - goto done; - } - } - p++; - -done: - if (p >= e) { - break; - } - } while ((p = memchr(p, '&', (e - p)))); - - new_len = e - new_str; - - new_str[new_len] = '\0'; - RETURN_STRINGL(new_str, new_len, 0); + RETURN_FALSE; } /* }}} */ @@ -1391,7 +1203,7 @@ PHP_FUNCTION(html_entity_decode) return; } - replaced = php_unescape_html_entities(str, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC); + replaced = php_unescape_html_entities(str, str_len, &len, 1 /*all*/, quote_style, hint_charset TSRMLS_CC); if (replaced) { RETURN_STRINGL(replaced, len, 0); } @@ -1446,15 +1258,20 @@ PHP_FUNCTION(get_html_translation_table) /* break thru */ case HTML_SPECIALCHARS: - for (j = 0; basic_entities[j].charcode != 0; j++) { + for (j = 0; basic_entities_ex[j].charcode != 0; j++) { + void *dummy; - if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0) + if 
(basic_entities_ex[j].flags && (quote_style & basic_entities_ex[j].flags) == 0) continue; - ind[0] = (unsigned char)basic_entities[j].charcode; - add_assoc_stringl(return_value, ind, basic_entities[j].entity, basic_entities[j].entitylen, 1); + ind[0] = (unsigned char)basic_entities_ex[j].charcode; + if (zend_hash_find(Z_ARRVAL_P(return_value), ind, sizeof(ind), &dummy) == FAILURE) { + /* in case of the single quote, which is repeated, the first one wins, + * so don't replace the existing mapping */ + add_assoc_stringl(return_value, ind, basic_entities_ex[j].entity, + basic_entities_ex[j].entitylen, 1); + } } - add_assoc_stringl(return_value, "&", "&amp;", sizeof("&amp;") - 1, 1); break; } diff --git a/ext/standard/html_tables.h b/ext/standard/html_tables.h new file mode 100644 index 0000000000..d3a638b695 --- /dev/null +++ b/ext/standard/html_tables.h @@ -0,0 +1,2080 @@ +/* + +----------------------------------------------------------------------+ + | PHP Version 5 | + +----------------------------------------------------------------------+ + | Copyright (c) 1997-2010 The PHP Group | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | http://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. 
| + +----------------------------------------------------------------------+ + | Author: Rasmus Lerdorf <rasmus@lerdorf.on.ca> | + +----------------------------------------------------------------------+ +*/ + +/* $Id: html.h 293036 2010-01-03 09:23:27Z sebastian $ */ + +#ifndef HTML_TABLES_H +#define HTML_TABLES_H + +/* cs_terminator is overloaded in the following fashion: + * - It terminates the list entity maps. + * - In BG(inverse_ent_maps), it's the key of the inverse map that stores + * only the basic entities. + * - When passed to traverse_for_entities (or via php_unescape_entities with !all), + * we don't care about the encoding (UTF-8 is chosen, but it should be used + * when it doesn't matter). + */ +enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, + cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, + cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r, + cs_cp1251, cs_8859_5, cs_cp866, cs_macroman, + cs_numelems /* used to count the number of charsets */ + }; +typedef const char *const entity_table_t; + +/* codepage 1252 is a Windows extension to iso-8859-1. 
*/ +static entity_table_t ent_cp_1252[] = { + "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger", + "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig", + NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo", + "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo", + "oelig", NULL, NULL, "Yuml" +}; + +static entity_table_t ent_iso_8859_1[] = { + "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", + "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg", + "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro", + "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14", + "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc", + "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", + "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", + "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", + "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", + "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", + "atilde", "auml", "aring", "aelig", "ccedil", "egrave", + "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", + "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", + "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", + "uuml", "yacute", "thorn", "yuml" +}; + +static entity_table_t ent_iso_8859_15[] = { + "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron", + "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg", + "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */ + "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm", + "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute", + "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", + "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", + "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", + "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", + "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", + "atilde", "auml", "aring", "aelig", 
"ccedil", "egrave", + "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", + "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", + "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", + "uuml", "yacute", "thorn", "yuml" +}; + +static entity_table_t ent_uni_338_402[] = { + /* 338 (0x0152) */ + "OElig", "oelig", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 352 (0x0160) */ + "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 376 (0x0178) */ + "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 400 (0x0190) */ + NULL, NULL, "fnof" +}; + +static entity_table_t ent_uni_spacing[] = { + /* 710 */ + "circ", + /* 711 - 730 */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 731 - 732 */ + NULL, "tilde" +}; + +static entity_table_t ent_uni_greek[] = { + /* 913 */ + "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta", + "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho", + NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega", + /* 938 - 944 are not mapped */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", + "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", + "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega", + /* 970 - 976 are not mapped */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "thetasym", "upsih", + NULL, NULL, NULL, + "piv" +}; + +static entity_table_t ent_uni_punct[] = { + /* 8194 */ + "ensp", "emsp", NULL, NULL, NULL, NULL, NULL, + "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm", + NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL, + /* 8216 */ + "lsquo", "rsquo", "sbquo", 
NULL, "ldquo", "rdquo", "bdquo", NULL, + "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL, + /* 8242 */ + "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL, + NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL, + "frasl" +}; + +static entity_table_t ent_uni_euro[] = { + "euro" +}; + +static entity_table_t ent_uni_8465_8501[] = { + /* 8465 */ + "image", NULL, NULL, NULL, NULL, NULL, NULL, + /* 8472 */ + "weierp", NULL, NULL, NULL, + /* 8476 */ + "real", NULL, NULL, NULL, NULL, NULL, + /* 8482 */ + "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8501 */ + "alefsym", +}; + +static entity_table_t ent_uni_8592_9002[] = { + /* 8592 (0x2190) */ + "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8608 (0x21a0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8624 (0x21b0) */ + NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8640 (0x21c0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8656 (0x21d0) */ + "lArr", "uArr", "rArr", "dArr", "hArr", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8672 (0x21e0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8704 (0x2200) */ + "forall", NULL, "part", "exist", NULL, "empty", NULL, "nabla", + "isin", "notin", NULL, "ni", NULL, NULL, NULL, "prod", + /* 8720 (0x2210) */ + NULL, "sum", "minus", NULL, NULL, NULL, NULL, "lowast", + NULL, NULL, "radic", NULL, NULL, "prop", "infin", NULL, + /* 8736 (0x2220) */ + "ang", NULL, NULL, NULL, NULL, NULL, 
NULL, "and", + "or", "cap", "cup", "int", NULL, NULL, NULL, NULL, + /* 8752 (0x2230) */ + NULL, NULL, NULL, NULL, "there4", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, "sim", NULL, NULL, NULL, + /* 8768 (0x2240) */ + NULL, NULL, NULL, NULL, NULL, "cong", NULL, NULL, + "asymp", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8784 (0x2250) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8800 (0x2260) */ + "ne", "equiv", NULL, NULL, "le", "ge", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8816 (0x2270) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8832 (0x2280) */ + NULL, NULL, "sub", "sup", "nsub", NULL, "sube", "supe", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8848 (0x2290) */ + NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8864 (0x22a0) */ + NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8880 (0x22b0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8896 (0x22c0) */ + NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8912 (0x22d0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8928 (0x22e0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8944 (0x22f0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8960 (0x2300) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL, + /* 8976 (0x2310) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8992 (0x2320) */ + NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, + NULL, "lang", "rang" +}; + +static entity_table_t ent_uni_9674[] = { + /* 9674 */ + "loz" +}; + +static entity_table_t ent_uni_9824_9830[] = { + /* 9824 */ + "spades", NULL, NULL, "clubs", NULL, "hearts", "diams" +}; + +static entity_table_t ent_koi8r[] = { + "#1105", /* "jo "*/ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092", + "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084", + "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090", + "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096", + "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041", + "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048", + "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055", + "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042", + "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063", + "#1066" +}; + +static entity_table_t ent_cp_1251[] = { + "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger", + "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036", + "#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220", + "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250", + "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118", + "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy", + "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn", + "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105", + "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111", + "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046", + "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053", + "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060", + "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067", + "#1068", "#1069", 
"#1070", "#1071", "#1072", "#1073", "#1074", + "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081", + "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088", + "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095", + "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102", + "#1103" +}; + +static entity_table_t ent_iso_8859_5[] = { + "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062", + "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069", + "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076", + "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083", + "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090", + "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", + "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104", + "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111", + "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118", + "#1119" +}; + +static entity_table_t ent_cp_866[] = { + + "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566", + "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552", + "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560", + "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608", + "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090", + "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", + "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025", + "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118", + "#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632", + "#160" +}; + +/* MacRoman has a couple of low-ascii chars that need mapping too */ +/* Vertical tab (ASCII 11) is often used to store line breaks inside */ +/* DB exports, this mapping changes it to a space */ +static entity_table_t ent_macroman[] = { + "sp", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, "quot", 
NULL, + NULL, NULL, "amp", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "lt", NULL, "gt", NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml", + "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring", + "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave", + "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml", + "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg", + "cent", "pound", "sect", "bull", "para", "szlig", "reg", + "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash", + "infin", "plusmn", "le", "ge", "yen", "micro", "part", + "sum", "prod", "pi", "int", "ordf", "ordm", "Omega", + "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof", + "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave", + "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo", + "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml", + "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger", + "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute", + "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute", + "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305", + "circ", "tilde", "macr", "#728", "#729", "#730", "cedil", + "#733", "#731", "#711" +}; + +struct html_entity_map { + enum entity_charset charset; /* charset identifier */ + unsigned int basechar; /* char code at start of table */ + unsigned int endchar; /* last char code in the table */ + entity_table_t *table; /* the table 
of mappings */ +}; + +static const struct html_entity_map entity_map[] = { + { cs_cp1252, 0x80, 0x9f, ent_cp_1252 }, + { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 }, + { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_utf_8, 338, 402, ent_uni_338_402 }, + { cs_utf_8, 710, 732, ent_uni_spacing }, + { cs_utf_8, 913, 982, ent_uni_greek }, + { cs_utf_8, 8194, 8260, ent_uni_punct }, + { cs_utf_8, 8364, 8364, ent_uni_euro }, + { cs_utf_8, 8465, 8501, ent_uni_8465_8501 }, + { cs_utf_8, 8592, 9002, ent_uni_8592_9002 }, + { cs_utf_8, 9674, 9674, ent_uni_9674 }, + { cs_utf_8, 9824, 9830, ent_uni_9824_9830 }, + { cs_big5, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_koi8r, 0xa3, 0xff, ent_koi8r }, + { cs_cp1251, 0x80, 0xff, ent_cp_1251 }, + { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 }, + { cs_cp866, 0xc0, 0xff, ent_cp_866 }, + { cs_macroman, 0x0b, 0xff, ent_macroman }, + { cs_terminator } +}; + +static const struct { + const char *codeset; + enum entity_charset charset; +} charset_map[] = { + { "ISO-8859-1", cs_8859_1 }, + { "ISO8859-1", cs_8859_1 }, + { "ISO-8859-15", cs_8859_15 }, + { "ISO8859-15", cs_8859_15 }, + { "utf-8", cs_utf_8 }, + { "cp1252", cs_cp1252 }, + { "Windows-1252", cs_cp1252 }, + { "1252", cs_cp1252 }, + { "BIG5", cs_big5 }, + { "950", cs_big5 }, + { "GB2312", cs_gb2312 }, + { "936", cs_gb2312 }, + { "BIG5-HKSCS", cs_big5hkscs }, + { "Shift_JIS", cs_sjis }, + { "SJIS", cs_sjis }, + { "932", cs_sjis }, + { "EUCJP", cs_eucjp }, + { "EUC-JP", cs_eucjp }, + { "KOI8-R", cs_koi8r }, + { "koi8-ru", cs_koi8r }, + { "koi8r", cs_koi8r }, + { "cp1251", cs_cp1251 }, + { "Windows-1251", cs_cp1251 }, + { "win-1251", cs_cp1251 }, + { "iso8859-5", cs_8859_5 }, + { "iso-8859-5", cs_8859_5 }, + { "cp866", cs_cp866 }, + { 
"866", cs_cp866 }, + { "ibm866", cs_cp866 }, + { "MacRoman", cs_macroman }, + { NULL } +}; + +typedef struct { + unsigned short charcode; + char *entity; + int entitylen; + int flags; +} basic_entity_t; + +static const basic_entity_t basic_entities_ex[] = { + { '&', "&amp;", 5, 0 }, + { '"', "&quot;", 6, ENT_HTML_QUOTE_DOUBLE }, + /* PHP traditionally encodes ' as ', not ', so leave this entry here */ + { '\'', "'", 6, ENT_HTML_QUOTE_SINGLE }, + { '\'', "'", 6, ENT_HTML_QUOTE_SINGLE }, + { '<', "&lt;", 4, 0 }, + { '>', "&gt;", 4, 0 }, + { 0, NULL, 0, 0 } +}; + +/* In some cases, we need to give special treatment to &, so we + * use this instead */ +static const basic_entity_t *basic_entities = &basic_entities_ex[1]; + +typedef struct { + unsigned short un_code_point; /* we don't need bigger */ + unsigned char cs_code; /* currently, we only have maps to single-byte encodings */ +} unicode_mapping; + +static const unicode_mapping unimap_iso885915[] = { + { 0xA5, 0xA5 }, /* yen sign */ + { 0xA7, 0xA7 }, /* section sign */ + { 0xA9, 0xA9 }, /* copyright sign */ + { 0xAA, 0xAA }, /* feminine ordinal indicator */ + { 0xAB, 0xAB }, /* left-pointing double angle quotation mark */ + { 0xAC, 0xAC }, /* not sign */ + { 0xAD, 0xAD }, /* soft hyphen */ + { 0xAE, 0xAE }, /* registered sign */ + { 0xAF, 0xAF }, /* macron */ + { 0xB0, 0xB0 }, /* degree sign */ + { 0xB1, 0xB1 }, /* plus-minus sign */ + { 0xB2, 0xB2 }, /* superscript two */ + { 0xB3, 0xB3 }, /* superscript three */ + { 0xB5, 0xB5 }, /* micro sign */ + { 0xB6, 0xB6 }, /* pilcrow sign */ + { 0xB7, 0xB7 }, /* middle dot */ + { 0xB9, 0xB9 }, /* superscript one */ + { 0xBA, 0xBA }, /* masculine ordinal indicator */ + { 0xBB, 0xBB }, /* right-pointing double angle quotation mark */ + { 0x152, 0xBC }, /* latin capital ligature oe */ + { 0x153, 0xBD }, /* latin small ligature oe */ + { 0x160, 0xA6 }, /* latin capital letter s with caron */ + { 0x161, 0xA8 }, /* latin small letter s with caron */ + { 0x178, 0xBE }, /* latin capital letter 
y with diaeresis */ + { 0x17D, 0xB4 }, /* latin capital letter z with caron */ + { 0x17E, 0xB8 }, /* latin small letter z with caron */ + { 0x20AC, 0xA4 }, /* euro sign */ +}; + +static const unicode_mapping unimap_win1252[] = { + { 0x152, 0x8C }, /* latin capital ligature oe */ + { 0x153, 0x9C }, /* latin small ligature oe */ + { 0x160, 0x8A }, /* latin capital letter s with caron */ + { 0x161, 0x9A }, /* latin small letter s with caron */ + { 0x178, 0x9F }, /* latin capital letter y with diaeresis */ + { 0x17D, 0x8E }, /* latin capital letter z with caron */ + { 0x17E, 0x9E }, /* latin small letter z with caron */ + { 0x192, 0x83 }, /* latin small letter f with hook */ + { 0x2C6, 0x88 }, /* modifier letter circumflex accent */ + { 0x2DC, 0x98 }, /* small tilde */ + { 0x2013, 0x96 }, /* en dash */ + { 0x2014, 0x97 }, /* em dash */ + { 0x2018, 0x91 }, /* left single quotation mark */ + { 0x2019, 0x92 }, /* right single quotation mark */ + { 0x201A, 0x82 }, /* single low-9 quotation mark */ + { 0x201C, 0x93 }, /* left double quotation mark */ + { 0x201D, 0x94 }, /* right double quotation mark */ + { 0x201E, 0x84 }, /* double low-9 quotation mark */ + { 0x2020, 0x86 }, /* dagger */ + { 0x2021, 0x87 }, /* double dagger */ + { 0x2022, 0x95 }, /* bullet */ + { 0x2026, 0x85 }, /* horizontal ellipsis */ + { 0x2030, 0x89 }, /* per mille sign */ + { 0x2039, 0x8B }, /* single left-pointing angle quotation mark */ + { 0x203A, 0x9B }, /* single right-pointing angle quotation mark */ + { 0x20AC, 0x80 }, /* euro sign */ + { 0x2122, 0x99 }, /* trade mark sign */ +}; + +static const unicode_mapping unimap_win1251[] = { + { 0xA0, 0xA0 }, /* no-break space */ + { 0xA4, 0xA4 }, /* currency sign */ + { 0xA6, 0xA6 }, /* broken bar */ + { 0xA7, 0xA7 }, /* section sign */ + { 0xA9, 0xA9 }, /* copyright sign */ + { 0xAB, 0xAB }, /* left-pointing double angle quotation mark */ + { 0xAC, 0xAC }, /* not sign */ + { 0xAD, 0xAD }, /* soft hyphen */ + { 0xAE, 0xAE }, /* registered sign */ + { 
0xB0, 0xB0 }, /* degree sign */ + { 0xB1, 0xB1 }, /* plus-minus sign */ + { 0xB5, 0xB5 }, /* micro sign */ + { 0xB6, 0xB6 }, /* pilcrow sign */ + { 0xB7, 0xB7 }, /* middle dot */ + { 0xBB, 0xBB }, /* right-pointing double angle quotation mark */ + { 0x401, 0xA8 }, /* cyrillic capital letter io */ + { 0x402, 0x80 }, /* cyrillic capital letter dje */ + { 0x403, 0x81 }, /* cyrillic capital letter gje */ + { 0x404, 0xAA }, /* cyrillic capital letter ukrainian ie */ + { 0x405, 0xBD }, /* cyrillic capital letter dze */ + { 0x406, 0xB2 }, /* cyrillic capital letter byelorussian-ukrainian i */ + { 0x407, 0xAF }, /* cyrillic capital letter yi */ + { 0x408, 0xA3 }, /* cyrillic capital letter je */ + { 0x409, 0x8A }, /* cyrillic capital letter lje */ + { 0x40A, 0x8C }, /* cyrillic capital letter nje */ + { 0x40B, 0x8E }, /* cyrillic capital letter tshe */ + { 0x40C, 0x8D }, /* cyrillic capital letter kje */ + { 0x40E, 0xA1 }, /* cyrillic capital letter short u */ + { 0x40F, 0x8F }, /* cyrillic capital letter dzhe */ + { 0x410, 0xC0 }, /* cyrillic capital letter a */ + { 0x411, 0xC1 }, /* cyrillic capital letter be */ + { 0x412, 0xC2 }, /* cyrillic capital letter ve */ + { 0x413, 0xC3 }, /* cyrillic capital letter ghe */ + { 0x414, 0xC4 }, /* cyrillic capital letter de */ + { 0x415, 0xC5 }, /* cyrillic capital letter ie */ + { 0x416, 0xC6 }, /* cyrillic capital letter zhe */ + { 0x417, 0xC7 }, /* cyrillic capital letter ze */ + { 0x418, 0xC8 }, /* cyrillic capital letter i */ + { 0x419, 0xC9 }, /* cyrillic capital letter short i */ + { 0x41A, 0xCA }, /* cyrillic capital letter ka */ + { 0x41B, 0xCB }, /* cyrillic capital letter el */ + { 0x41C, 0xCC }, /* cyrillic capital letter em */ + { 0x41D, 0xCD }, /* cyrillic capital letter en */ + { 0x41E, 0xCE }, /* cyrillic capital letter o */ + { 0x41F, 0xCF }, /* cyrillic capital letter pe */ + { 0x420, 0xD0 }, /* cyrillic capital letter er */ + { 0x421, 0xD1 }, /* cyrillic capital letter es */ + { 0x422, 0xD2 }, /* cyrillic capital 
letter te */ + { 0x423, 0xD3 }, /* cyrillic capital letter u */ + { 0x424, 0xD4 }, /* cyrillic capital letter ef */ + { 0x425, 0xD5 }, /* cyrillic capital letter ha */ + { 0x426, 0xD6 }, /* cyrillic capital letter tse */ + { 0x427, 0xD7 }, /* cyrillic capital letter che */ + { 0x428, 0xD8 }, /* cyrillic capital letter sha */ + { 0x429, 0xD9 }, /* cyrillic capital letter shcha */ + { 0x42A, 0xDA }, /* cyrillic capital letter hard sign */ + { 0x42B, 0xDB }, /* cyrillic capital letter yeru */ + { 0x42C, 0xDC }, /* cyrillic capital letter soft sign */ + { 0x42D, 0xDD }, /* cyrillic capital letter e */ + { 0x42E, 0xDE }, /* cyrillic capital letter yu */ + { 0x42F, 0xDF }, /* cyrillic capital letter ya */ + { 0x430, 0xE0 }, /* cyrillic small letter a */ + { 0x431, 0xE1 }, /* cyrillic small letter be */ + { 0x432, 0xE2 }, /* cyrillic small letter ve */ + { 0x433, 0xE3 }, /* cyrillic small letter ghe */ + { 0x434, 0xE4 }, /* cyrillic small letter de */ + { 0x435, 0xE5 }, /* cyrillic small letter ie */ + { 0x436, 0xE6 }, /* cyrillic small letter zhe */ + { 0x437, 0xE7 }, /* cyrillic small letter ze */ + { 0x438, 0xE8 }, /* cyrillic small letter i */ + { 0x439, 0xE9 }, /* cyrillic small letter short i */ + { 0x43A, 0xEA }, /* cyrillic small letter ka */ + { 0x43B, 0xEB }, /* cyrillic small letter el */ + { 0x43C, 0xEC }, /* cyrillic small letter em */ + { 0x43D, 0xED }, /* cyrillic small letter en */ + { 0x43E, 0xEE }, /* cyrillic small letter o */ + { 0x43F, 0xEF }, /* cyrillic small letter pe */ + { 0x440, 0xF0 }, /* cyrillic small letter er */ + { 0x441, 0xF1 }, /* cyrillic small letter es */ + { 0x442, 0xF2 }, /* cyrillic small letter te */ + { 0x443, 0xF3 }, /* cyrillic small letter u */ + { 0x444, 0xF4 }, /* cyrillic small letter ef */ + { 0x445, 0xF5 }, /* cyrillic small letter ha */ + { 0x446, 0xF6 }, /* cyrillic small letter tse */ + { 0x447, 0xF7 }, /* cyrillic small letter che */ + { 0x448, 0xF8 }, /* cyrillic small letter sha */ + { 0x449, 0xF9 }, /* cyrillic 
small letter shcha */ + { 0x44A, 0xFA }, /* cyrillic small letter hard sign */ + { 0x44B, 0xFB }, /* cyrillic small letter yeru */ + { 0x44C, 0xFC }, /* cyrillic small letter soft sign */ + { 0x44D, 0xFD }, /* cyrillic small letter e */ + { 0x44E, 0xFE }, /* cyrillic small letter yu */ + { 0x44F, 0xFF }, /* cyrillic small letter ya */ + { 0x451, 0xB8 }, /* cyrillic small letter io */ + { 0x452, 0x90 }, /* cyrillic small letter dje */ + { 0x453, 0x83 }, /* cyrillic small letter gje */ + { 0x454, 0xBA }, /* cyrillic small letter ukrainian ie */ + { 0x455, 0xBE }, /* cyrillic small letter dze */ + { 0x456, 0xB3 }, /* cyrillic small letter byelorussian-ukrainian i */ + { 0x457, 0xBF }, /* cyrillic small letter yi */ + { 0x458, 0xBC }, /* cyrillic small letter je */ + { 0x459, 0x9A }, /* cyrillic small letter lje */ + { 0x45A, 0x9C }, /* cyrillic small letter nje */ + { 0x45B, 0x9E }, /* cyrillic small letter tshe */ + { 0x45C, 0x9D }, /* cyrillic small letter kje */ + { 0x45E, 0xA2 }, /* cyrillic small letter short u */ + { 0x45F, 0x9F }, /* cyrillic small letter dzhe */ + { 0x490, 0xA5 }, /* cyrillic capital letter ghe with upturn */ + { 0x491, 0xB4 }, /* cyrillic small letter ghe with upturn */ + { 0x2013, 0x96 }, /* en dash */ + { 0x2014, 0x97 }, /* em dash */ + { 0x2018, 0x91 }, /* left single quotation mark */ + { 0x2019, 0x92 }, /* right single quotation mark */ + { 0x201A, 0x82 }, /* single low-9 quotation mark */ + { 0x201C, 0x93 }, /* left double quotation mark */ + { 0x201D, 0x94 }, /* right double quotation mark */ + { 0x201E, 0x84 }, /* double low-9 quotation mark */ + { 0x2020, 0x86 }, /* dagger */ + { 0x2021, 0x87 }, /* double dagger */ + { 0x2022, 0x95 }, /* bullet */ + { 0x2026, 0x85 }, /* horizontal ellipsis */ + { 0x2030, 0x89 }, /* per mille sign */ + { 0x2039, 0x8B }, /* single left-pointing angle quotation mark */ + { 0x203A, 0x9B }, /* single right-pointing angle quotation mark */ + { 0x20AC, 0x88 }, /* euro sign */ + { 0x2116, 0xB9 }, /* numero 
sign */ + { 0x2122, 0x99 }, /* trade mark sign */ +}; + +static const unicode_mapping unimap_koi8r[] = { + { 0xA0, 0x9A }, /* no-break space */ + { 0xA9, 0xBF }, /* copyright sign */ + { 0xB0, 0x9C }, /* degree sign */ + { 0xB2, 0x9D }, /* superscript two */ + { 0xB7, 0x9E }, /* middle dot */ + { 0xF7, 0x9F }, /* division sign */ + { 0x401, 0xB3 }, /* cyrillic capital letter io */ + { 0x410, 0xE1 }, /* cyrillic capital letter a */ + { 0x411, 0xE2 }, /* cyrillic capital letter be */ + { 0x412, 0xF7 }, /* cyrillic capital letter ve */ + { 0x413, 0xE7 }, /* cyrillic capital letter ghe */ + { 0x414, 0xE4 }, /* cyrillic capital letter de */ + { 0x415, 0xE5 }, /* cyrillic capital letter ie */ + { 0x416, 0xF6 }, /* cyrillic capital letter zhe */ + { 0x417, 0xFA }, /* cyrillic capital letter ze */ + { 0x418, 0xE9 }, /* cyrillic capital letter i */ + { 0x419, 0xEA }, /* cyrillic capital letter short i */ + { 0x41A, 0xEB }, /* cyrillic capital letter ka */ + { 0x41B, 0xEC }, /* cyrillic capital letter el */ + { 0x41C, 0xED }, /* cyrillic capital letter em */ + { 0x41D, 0xEE }, /* cyrillic capital letter en */ + { 0x41E, 0xEF }, /* cyrillic capital letter o */ + { 0x41F, 0xF0 }, /* cyrillic capital letter pe */ + { 0x420, 0xF2 }, /* cyrillic capital letter er */ + { 0x421, 0xF3 }, /* cyrillic capital letter es */ + { 0x422, 0xF4 }, /* cyrillic capital letter te */ + { 0x423, 0xF5 }, /* cyrillic capital letter u */ + { 0x424, 0xE6 }, /* cyrillic capital letter ef */ + { 0x425, 0xE8 }, /* cyrillic capital letter ha */ + { 0x426, 0xE3 }, /* cyrillic capital letter tse */ + { 0x427, 0xFE }, /* cyrillic capital letter che */ + { 0x428, 0xFB }, /* cyrillic capital letter sha */ + { 0x429, 0xFD }, /* cyrillic capital letter shcha */ + { 0x42A, 0xFF }, /* cyrillic capital letter hard sign */ + { 0x42B, 0xF9 }, /* cyrillic capital letter yeru */ + { 0x42C, 0xF8 }, /* cyrillic capital letter soft sign */ + { 0x42D, 0xFC }, /* cyrillic capital letter e */ + { 0x42E, 0xE0 }, /* cyrillic 
capital letter yu */ + { 0x42F, 0xF1 }, /* cyrillic capital letter ya */ + { 0x430, 0xC1 }, /* cyrillic small letter a */ + { 0x431, 0xC2 }, /* cyrillic small letter be */ + { 0x432, 0xD7 }, /* cyrillic small letter ve */ + { 0x433, 0xC7 }, /* cyrillic small letter ghe */ + { 0x434, 0xC4 }, /* cyrillic small letter de */ + { 0x435, 0xC5 }, /* cyrillic small letter ie */ + { 0x436, 0xD6 }, /* cyrillic small letter zhe */ + { 0x437, 0xDA }, /* cyrillic small letter ze */ + { 0x438, 0xC9 }, /* cyrillic small letter i */ + { 0x439, 0xCA }, /* cyrillic small letter short i */ + { 0x43A, 0xCB }, /* cyrillic small letter ka */ + { 0x43B, 0xCC }, /* cyrillic small letter el */ + { 0x43C, 0xCD }, /* cyrillic small letter em */ + { 0x43D, 0xCE }, /* cyrillic small letter en */ + { 0x43E, 0xCF }, /* cyrillic small letter o */ + { 0x43F, 0xD0 }, /* cyrillic small letter pe */ + { 0x440, 0xD2 }, /* cyrillic small letter er */ + { 0x441, 0xD3 }, /* cyrillic small letter es */ + { 0x442, 0xD4 }, /* cyrillic small letter te */ + { 0x443, 0xD5 }, /* cyrillic small letter u */ + { 0x444, 0xC6 }, /* cyrillic small letter ef */ + { 0x445, 0xC8 }, /* cyrillic small letter ha */ + { 0x446, 0xC3 }, /* cyrillic small letter tse */ + { 0x447, 0xDE }, /* cyrillic small letter che */ + { 0x448, 0xDB }, /* cyrillic small letter sha */ + { 0x449, 0xDD }, /* cyrillic small letter shcha */ + { 0x44A, 0xDF }, /* cyrillic small letter hard sign */ + { 0x44B, 0xD9 }, /* cyrillic small letter yeru */ + { 0x44C, 0xD8 }, /* cyrillic small letter soft sign */ + { 0x44D, 0xDC }, /* cyrillic small letter e */ + { 0x44E, 0xC0 }, /* cyrillic small letter yu */ + { 0x44F, 0xD1 }, /* cyrillic small letter ya */ + { 0x451, 0xA3 }, /* cyrillic small letter io */ + { 0x2219, 0x95 }, /* bullet operator */ + { 0x221A, 0x96 }, /* square root */ + { 0x2248, 0x97 }, /* almost equal to */ + { 0x2264, 0x98 }, /* less-than or equal to */ + { 0x2265, 0x99 }, /* greater-than or equal to */ + { 0x2320, 0x93 }, /* top half 
integral */ + { 0x2321, 0x9B }, /* bottom half integral */ + { 0x2500, 0x80 }, /* box drawings light horizontal */ + { 0x2502, 0x81 }, /* box drawings light vertical */ + { 0x250C, 0x82 }, /* box drawings light down and right */ + { 0x2510, 0x83 }, /* box drawings light down and left */ + { 0x2514, 0x84 }, /* box drawings light up and right */ + { 0x2518, 0x85 }, /* box drawings light up and left */ + { 0x251C, 0x86 }, /* box drawings light vertical and right */ + { 0x2524, 0x87 }, /* box drawings light vertical and left */ + { 0x252C, 0x88 }, /* box drawings light down and horizontal */ + { 0x2534, 0x89 }, /* box drawings light up and horizontal */ + { 0x253C, 0x8A }, /* box drawings light vertical and horizontal */ + { 0x2550, 0xA0 }, /* box drawings double horizontal */ + { 0x2551, 0xA1 }, /* box drawings double vertical */ + { 0x2552, 0xA2 }, /* box drawings down single and right double */ + { 0x2553, 0xA4 }, /* box drawings down double and right single */ + { 0x2554, 0xA5 }, /* box drawings double down and right */ + { 0x2555, 0xA6 }, /* box drawings down single and left double */ + { 0x2556, 0xA7 }, /* box drawings down double and left single */ + { 0x2557, 0xA8 }, /* box drawings double down and left */ + { 0x2558, 0xA9 }, /* box drawings up single and right double */ + { 0x2559, 0xAA }, /* box drawings up double and right single */ + { 0x255A, 0xAB }, /* box drawings double up and right */ + { 0x255B, 0xAC }, /* box drawings up single and left double */ + { 0x255C, 0xAD }, /* box drawings up double and left single */ + { 0x255D, 0xAE }, /* box drawings double up and left */ + { 0x255E, 0xAF }, /* box drawings vertical single and right double */ + { 0x255F, 0xB0 }, /* box drawings vertical double and right single */ + { 0x2560, 0xB1 }, /* box drawings double vertical and right */ + { 0x2561, 0xB2 }, /* box drawings vertical single and left double */ + { 0x2562, 0xB4 }, /* box drawings vertical double and left single */ + { 0x2563, 0xB5 }, /* box drawings 
double vertical and left */ + { 0x2564, 0xB6 }, /* box drawings down single and horizontal double */ + { 0x2565, 0xB7 }, /* box drawings down double and horizontal single */ + { 0x2566, 0xB8 }, /* box drawings double down and horizontal */ + { 0x2567, 0xB9 }, /* box drawings up single and horizontal double */ + { 0x2568, 0xBA }, /* box drawings up double and horizontal single */ + { 0x2569, 0xBB }, /* box drawings double up and horizontal */ + { 0x256A, 0xBC }, /* box drawings vertical single and horizontal double */ + { 0x256B, 0xBD }, /* box drawings vertical double and horizontal single */ + { 0x256C, 0xBE }, /* box drawings double vertical and horizontal */ + { 0x2580, 0x8B }, /* upper half block */ + { 0x2584, 0x8C }, /* lower half block */ + { 0x2588, 0x8D }, /* full block */ + { 0x258C, 0x8E }, /* left half block */ + { 0x2590, 0x8F }, /* right half block */ + { 0x2591, 0x90 }, /* light shade */ + { 0x2592, 0x91 }, /* medium shade */ + { 0x2593, 0x92 }, /* dark shade */ + { 0x25A0, 0x94 }, /* black square */ +}; + +static const unicode_mapping unimap_cp866[] = { + { 0xA0, 0xFF }, /* no-break space */ + { 0xA4, 0xFD }, /* currency sign */ + { 0xB0, 0xF8 }, /* degree sign */ + { 0xB7, 0xFA }, /* middle dot */ + { 0x401, 0xF0 }, /* cyrillic capital letter io */ + { 0x404, 0xF2 }, /* cyrillic capital letter ukrainian ie */ + { 0x407, 0xF4 }, /* cyrillic capital letter yi */ + { 0x40E, 0xF6 }, /* cyrillic capital letter short u */ + { 0x410, 0x80 }, /* cyrillic capital letter a */ + { 0x411, 0x81 }, /* cyrillic capital letter be */ + { 0x412, 0x82 }, /* cyrillic capital letter ve */ + { 0x413, 0x83 }, /* cyrillic capital letter ghe */ + { 0x414, 0x84 }, /* cyrillic capital letter de */ + { 0x415, 0x85 }, /* cyrillic capital letter ie */ + { 0x416, 0x86 }, /* cyrillic capital letter zhe */ + { 0x417, 0x87 }, /* cyrillic capital letter ze */ + { 0x418, 0x88 }, /* cyrillic capital letter i */ + { 0x419, 0x89 }, /* cyrillic capital letter short i */ + { 0x41A, 0x8A 
}, /* cyrillic capital letter ka */ + { 0x41B, 0x8B }, /* cyrillic capital letter el */ + { 0x41C, 0x8C }, /* cyrillic capital letter em */ + { 0x41D, 0x8D }, /* cyrillic capital letter en */ + { 0x41E, 0x8E }, /* cyrillic capital letter o */ + { 0x41F, 0x8F }, /* cyrillic capital letter pe */ + { 0x420, 0x90 }, /* cyrillic capital letter er */ + { 0x421, 0x91 }, /* cyrillic capital letter es */ + { 0x422, 0x92 }, /* cyrillic capital letter te */ + { 0x423, 0x93 }, /* cyrillic capital letter u */ + { 0x424, 0x94 }, /* cyrillic capital letter ef */ + { 0x425, 0x95 }, /* cyrillic capital letter ha */ + { 0x426, 0x96 }, /* cyrillic capital letter tse */ + { 0x427, 0x97 }, /* cyrillic capital letter che */ + { 0x428, 0x98 }, /* cyrillic capital letter sha */ + { 0x429, 0x99 }, /* cyrillic capital letter shcha */ + { 0x42A, 0x9A }, /* cyrillic capital letter hard sign */ + { 0x42B, 0x9B }, /* cyrillic capital letter yeru */ + { 0x42C, 0x9C }, /* cyrillic capital letter soft sign */ + { 0x42D, 0x9D }, /* cyrillic capital letter e */ + { 0x42E, 0x9E }, /* cyrillic capital letter yu */ + { 0x42F, 0x9F }, /* cyrillic capital letter ya */ + { 0x430, 0xA0 }, /* cyrillic small letter a */ + { 0x431, 0xA1 }, /* cyrillic small letter be */ + { 0x432, 0xA2 }, /* cyrillic small letter ve */ + { 0x433, 0xA3 }, /* cyrillic small letter ghe */ + { 0x434, 0xA4 }, /* cyrillic small letter de */ + { 0x435, 0xA5 }, /* cyrillic small letter ie */ + { 0x436, 0xA6 }, /* cyrillic small letter zhe */ + { 0x437, 0xA7 }, /* cyrillic small letter ze */ + { 0x438, 0xA8 }, /* cyrillic small letter i */ + { 0x439, 0xA9 }, /* cyrillic small letter short i */ + { 0x43A, 0xAA }, /* cyrillic small letter ka */ + { 0x43B, 0xAB }, /* cyrillic small letter el */ + { 0x43C, 0xAC }, /* cyrillic small letter em */ + { 0x43D, 0xAD }, /* cyrillic small letter en */ + { 0x43E, 0xAE }, /* cyrillic small letter o */ + { 0x43F, 0xAF }, /* cyrillic small letter pe */ + { 0x440, 0xE0 }, /* cyrillic small letter er */ + { 0x441, 0xE1 }, /* cyrillic small letter es */ 
+ { 0x442, 0xE2 }, /* cyrillic small letter te */ + { 0x443, 0xE3 }, /* cyrillic small letter u */ + { 0x444, 0xE4 }, /* cyrillic small letter ef */ + { 0x445, 0xE5 }, /* cyrillic small letter ha */ + { 0x446, 0xE6 }, /* cyrillic small letter tse */ + { 0x447, 0xE7 }, /* cyrillic small letter che */ + { 0x448, 0xE8 }, /* cyrillic small letter sha */ + { 0x449, 0xE9 }, /* cyrillic small letter shcha */ + { 0x44A, 0xEA }, /* cyrillic small letter hard sign */ + { 0x44B, 0xEB }, /* cyrillic small letter yeru */ + { 0x44C, 0xEC }, /* cyrillic small letter soft sign */ + { 0x44D, 0xED }, /* cyrillic small letter e */ + { 0x44E, 0xEE }, /* cyrillic small letter yu */ + { 0x44F, 0xEF }, /* cyrillic small letter ya */ + { 0x451, 0xF1 }, /* cyrillic small letter io */ + { 0x454, 0xF3 }, /* cyrillic small letter ukrainian ie */ + { 0x457, 0xF5 }, /* cyrillic small letter yi */ + { 0x45E, 0xF7 }, /* cyrillic small letter short u */ + { 0x2116, 0xFC }, /* numero sign */ + { 0x2219, 0xF9 }, /* bullet operator */ + { 0x221A, 0xFB }, /* square root */ + { 0x2500, 0xC4 }, /* box drawings light horizontal */ + { 0x2502, 0xB3 }, /* box drawings light vertical */ + { 0x250C, 0xDA }, /* box drawings light down and right */ + { 0x2510, 0xBF }, /* box drawings light down and left */ + { 0x2514, 0xC0 }, /* box drawings light up and right */ + { 0x2518, 0xD9 }, /* box drawings light up and left */ + { 0x251C, 0xC3 }, /* box drawings light vertical and right */ + { 0x2524, 0xB4 }, /* box drawings light vertical and left */ + { 0x252C, 0xC2 }, /* box drawings light down and horizontal */ + { 0x2534, 0xC1 }, /* box drawings light up and horizontal */ + { 0x253C, 0xC5 }, /* box drawings light vertical and horizontal */ + { 0x2550, 0xCD }, /* box drawings double horizontal */ + { 0x2551, 0xBA }, /* box drawings double vertical */ + { 0x2552, 0xD5 }, /* box drawings down single and right double */ + { 0x2553, 0xD6 }, /* box drawings down double and right single */ + { 0x2554, 0xC9 }, /* box 
drawings double down and right */ + { 0x2555, 0xB8 }, /* box drawings down single and left double */ + { 0x2556, 0xB7 }, /* box drawings down double and left single */ + { 0x2557, 0xBB }, /* box drawings double down and left */ + { 0x2558, 0xD4 }, /* box drawings up single and right double */ + { 0x2559, 0xD3 }, /* box drawings up double and right single */ + { 0x255A, 0xC8 }, /* box drawings double up and right */ + { 0x255B, 0xBE }, /* box drawings up single and left double */ + { 0x255C, 0xBD }, /* box drawings up double and left single */ + { 0x255D, 0xBC }, /* box drawings double up and left */ + { 0x255E, 0xC6 }, /* box drawings vertical single and right double */ + { 0x255F, 0xC7 }, /* box drawings vertical double and right single */ + { 0x2560, 0xCC }, /* box drawings double vertical and right */ + { 0x2561, 0xB5 }, /* box drawings vertical single and left double */ + { 0x2562, 0xB6 }, /* box drawings vertical double and left single */ + { 0x2563, 0xB9 }, /* box drawings double vertical and left */ + { 0x2564, 0xD1 }, /* box drawings down single and horizontal double */ + { 0x2565, 0xD2 }, /* box drawings down double and horizontal single */ + { 0x2566, 0xCB }, /* box drawings double down and horizontal */ + { 0x2567, 0xCF }, /* box drawings up single and horizontal double */ + { 0x2568, 0xD0 }, /* box drawings up double and horizontal single */ + { 0x2569, 0xCA }, /* box drawings double up and horizontal */ + { 0x256A, 0xD8 }, /* box drawings vertical single and horizontal double */ + { 0x256B, 0xD7 }, /* box drawings vertical double and horizontal single */ + { 0x256C, 0xCE }, /* box drawings double vertical and horizontal */ + { 0x2580, 0xDF }, /* upper half block */ + { 0x2584, 0xDC }, /* lower half block */ + { 0x2588, 0xDB }, /* full block */ + { 0x258C, 0xDD }, /* left half block */ + { 0x2590, 0xDE }, /* right half block */ + { 0x2591, 0xB0 }, /* light shade */ + { 0x2592, 0xB1 }, /* medium shade */ + { 0x2593, 0xB2 }, /* dark shade */ + { 0x25A0, 
0xFE }, /* black square */ +}; + +static const unicode_mapping unimap_macroman[] = { + { 0xA0, 0xCA }, /* no-break space */ + { 0xA1, 0xC1 }, /* inverted exclamation mark */ + { 0xA2, 0xA2 }, /* cent sign */ + { 0xA3, 0xA3 }, /* pound sign */ + { 0xA5, 0xB4 }, /* yen sign */ + { 0xA7, 0xA4 }, /* section sign */ + { 0xA8, 0xAC }, /* diaeresis */ + { 0xA9, 0xA9 }, /* copyright sign */ + { 0xAA, 0xBB }, /* feminine ordinal indicator */ + { 0xAB, 0xC7 }, /* left-pointing double angle quotation mark */ + { 0xAC, 0xC2 }, /* not sign */ + { 0xAE, 0xA8 }, /* registered sign */ + { 0xAF, 0xF8 }, /* macron */ + { 0xB0, 0xA1 }, /* degree sign */ + { 0xB1, 0xB1 }, /* plus-minus sign */ + { 0xB4, 0xAB }, /* acute accent */ + { 0xB5, 0xB5 }, /* micro sign */ + { 0xB6, 0xA6 }, /* pilcrow sign */ + { 0xB7, 0xE1 }, /* middle dot */ + { 0xB8, 0xFC }, /* cedilla */ + { 0xBA, 0xBC }, /* masculine ordinal indicator */ + { 0xBB, 0xC8 }, /* right-pointing double angle quotation mark */ + { 0xBF, 0xC0 }, /* inverted question mark */ + { 0xC0, 0xCB }, /* latin capital letter a with grave */ + { 0xC1, 0xE7 }, /* latin capital letter a with acute */ + { 0xC2, 0xE5 }, /* latin capital letter a with circumflex */ + { 0xC3, 0xCC }, /* latin capital letter a with tilde */ + { 0xC4, 0x80 }, /* latin capital letter a with diaeresis */ + { 0xC5, 0x81 }, /* latin capital letter a with ring above */ + { 0xC6, 0xAE }, /* latin capital letter ae */ + { 0xC7, 0x82 }, /* latin capital letter c with cedilla */ + { 0xC8, 0xE9 }, /* latin capital letter e with grave */ + { 0xC9, 0x83 }, /* latin capital letter e with acute */ + { 0xCA, 0xE6 }, /* latin capital letter e with circumflex */ + { 0xCB, 0xE8 }, /* latin capital letter e with diaeresis */ + { 0xCC, 0xED }, /* latin capital letter i with grave */ + { 0xCD, 0xEA }, /* latin capital letter i with acute */ + { 0xCE, 0xEB }, /* latin capital letter i with circumflex */ + { 0xCF, 0xEC }, /* latin capital letter i with diaeresis */ + { 0xD1, 0x84 }, /* 
latin capital letter n with tilde */ + { 0xD2, 0xF1 }, /* latin capital letter o with grave */ + { 0xD3, 0xEE }, /* latin capital letter o with acute */ + { 0xD4, 0xEF }, /* latin capital letter o with circumflex */ + { 0xD5, 0xCD }, /* latin capital letter o with tilde */ + { 0xD6, 0x85 }, /* latin capital letter o with diaeresis */ + { 0xD8, 0xAF }, /* latin capital letter o with stroke */ + { 0xD9, 0xF4 }, /* latin capital letter u with grave */ + { 0xDA, 0xF2 }, /* latin capital letter u with acute */ + { 0xDB, 0xF3 }, /* latin capital letter u with circumflex */ + { 0xDC, 0x86 }, /* latin capital letter u with diaeresis */ + { 0xDF, 0xA7 }, /* latin small letter sharp s */ + { 0xE0, 0x88 }, /* latin small letter a with grave */ + { 0xE1, 0x87 }, /* latin small letter a with acute */ + { 0xE2, 0x89 }, /* latin small letter a with circumflex */ + { 0xE3, 0x8B }, /* latin small letter a with tilde */ + { 0xE4, 0x8A }, /* latin small letter a with diaeresis */ + { 0xE5, 0x8C }, /* latin small letter a with ring above */ + { 0xE6, 0xBE }, /* latin small letter ae */ + { 0xE7, 0x8D }, /* latin small letter c with cedilla */ + { 0xE8, 0x8F }, /* latin small letter e with grave */ + { 0xE9, 0x8E }, /* latin small letter e with acute */ + { 0xEA, 0x90 }, /* latin small letter e with circumflex */ + { 0xEB, 0x91 }, /* latin small letter e with diaeresis */ + { 0xEC, 0x93 }, /* latin small letter i with grave */ + { 0xED, 0x92 }, /* latin small letter i with acute */ + { 0xEE, 0x94 }, /* latin small letter i with circumflex */ + { 0xEF, 0x95 }, /* latin small letter i with diaeresis */ + { 0xF1, 0x96 }, /* latin small letter n with tilde */ + { 0xF2, 0x98 }, /* latin small letter o with grave */ + { 0xF3, 0x97 }, /* latin small letter o with acute */ + { 0xF4, 0x99 }, /* latin small letter o with circumflex */ + { 0xF5, 0x9B }, /* latin small letter o with tilde */ + { 0xF6, 0x9A }, /* latin small letter o with diaeresis */ + { 0xF7, 0xD6 }, /* division sign */ + { 0xF8, 
0xBF }, /* latin small letter o with stroke */ + { 0xF9, 0x9D }, /* latin small letter u with grave */ + { 0xFA, 0x9C }, /* latin small letter u with acute */ + { 0xFB, 0x9E }, /* latin small letter u with circumflex */ + { 0xFC, 0x9F }, /* latin small letter u with diaeresis */ + { 0xFF, 0xD8 }, /* latin small letter y with diaeresis */ + { 0x131, 0xF5 }, /* latin small letter dotless i */ + { 0x152, 0xCE }, /* latin capital ligature oe */ + { 0x153, 0xCF }, /* latin small ligature oe */ + { 0x178, 0xD9 }, /* latin capital letter y with diaeresis */ + { 0x192, 0xC4 }, /* latin small letter f with hook */ + { 0x2C6, 0xF6 }, /* modifier letter circumflex accent */ + { 0x2C7, 0xFF }, /* caron */ + { 0x2D8, 0xF9 }, /* breve */ + { 0x2D9, 0xFA }, /* dot above */ + { 0x2DA, 0xFB }, /* ring above */ + { 0x2DB, 0xFE }, /* ogonek */ + { 0x2DC, 0xF7 }, /* small tilde */ + { 0x2DD, 0xFD }, /* double acute accent */ + { 0x3A9, 0xBD }, /* greek capital letter omega */ + { 0x3C0, 0xB9 }, /* greek small letter pi */ + { 0x2013, 0xD0 }, /* en dash */ + { 0x2014, 0xD1 }, /* em dash */ + { 0x2018, 0xD4 }, /* left single quotation mark */ + { 0x2019, 0xD5 }, /* right single quotation mark */ + { 0x201A, 0xE2 }, /* single low-9 quotation mark */ + { 0x201C, 0xD2 }, /* left double quotation mark */ + { 0x201D, 0xD3 }, /* right double quotation mark */ + { 0x201E, 0xE3 }, /* double low-9 quotation mark */ + { 0x2020, 0xA0 }, /* dagger */ + { 0x2021, 0xE0 }, /* double dagger */ + { 0x2022, 0xA5 }, /* bullet */ + { 0x2026, 0xC9 }, /* horizontal ellipsis */ + { 0x2030, 0xE4 }, /* per mille sign */ + { 0x2039, 0xDC }, /* single left-pointing angle quotation mark */ + { 0x203A, 0xDD }, /* single right-pointing angle quotation mark */ + { 0x2044, 0xDA }, /* fraction slash */ + { 0x20AC, 0xDB }, /* euro sign */ + { 0x2122, 0xAA }, /* trade mark sign */ + { 0x2202, 0xB6 }, /* partial differential */ + { 0x2206, 0xC6 }, /* increment */ + { 0x220F, 0xB8 }, /* n-ary product */ + { 0x2211, 0xB7 }, 
/* n-ary summation */ + { 0x221A, 0xC3 }, /* square root */ + { 0x221E, 0xB0 }, /* infinity */ + { 0x222B, 0xBA }, /* integral */ + { 0x2248, 0xC5 }, /* almost equal to */ + { 0x2260, 0xAD }, /* not equal to */ + { 0x2264, 0xB2 }, /* less-than or equal to */ + { 0x2265, 0xB3 }, /* greater-than or equal to */ + { 0x25CA, 0xD7 }, /* lozenge */ + { 0xF8FF, 0xF0 }, /* apple logo */ + { 0xFB01, 0xDE }, /* latin small ligature fi */ + { 0xFB02, 0xDF }, /* latin small ligature fl */ +}; + +#endif /* HTML_TABLES_H */ +/* + +----------------------------------------------------------------------+ + | PHP Version 5 | + +----------------------------------------------------------------------+ + | Copyright (c) 1997-2010 The PHP Group | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | http://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Author: Rasmus Lerdorf <rasmus@lerdorf.on.ca> | + +----------------------------------------------------------------------+ +*/ + +/* $Id: html.h 293036 2010-01-03 09:23:27Z sebastian $ */ + +#ifndef HTML_TABLES_H +#define HTML_TABLES_H + +/* cs_terminator is overloaded in the following fashion: + * - It terminates the list entity maps. + * - In BG(inverse_ent_maps), it's the key of the inverse map that stores + * only the basic entities. + * - When passed to traverse_for_entities (or via php_unescape_entities with !all), + * we don't care about the encoding (UTF-8 is chosen, but it should be used + * when it doesn't matter). 
+ */ +enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, + cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, + cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r, + cs_cp1251, cs_8859_5, cs_cp866, cs_macroman, + cs_numelems /* used to count the number of charsets */ + }; +typedef const char *const entity_table_t; + +/* codepage 1252 is a Windows extension to iso-8859-1. */ +static entity_table_t ent_cp_1252[] = { + "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger", + "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig", + NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo", + "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo", + "oelig", NULL, NULL, "Yuml" +}; + +static entity_table_t ent_iso_8859_1[] = { + "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", + "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg", + "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro", + "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14", + "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc", + "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", + "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", + "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", + "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", + "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", + "atilde", "auml", "aring", "aelig", "ccedil", "egrave", + "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", + "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", + "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", + "uuml", "yacute", "thorn", "yuml" +}; + +static entity_table_t ent_iso_8859_15[] = { + "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron", + "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg", + "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */ + "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm", + "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", 
"Aacute", + "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", + "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", + "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", + "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", + "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", + "atilde", "auml", "aring", "aelig", "ccedil", "egrave", + "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", + "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", + "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", + "uuml", "yacute", "thorn", "yuml" +}; + +static entity_table_t ent_uni_338_402[] = { + /* 338 (0x0152) */ + "OElig", "oelig", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 352 (0x0160) */ + "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 376 (0x0178) */ + "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 400 (0x0190) */ + NULL, NULL, "fnof" +}; + +static entity_table_t ent_uni_spacing[] = { + /* 710 */ + "circ", + /* 711 - 730 */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 731 - 732 */ + NULL, "tilde" +}; + +static entity_table_t ent_uni_greek[] = { + /* 913 */ + "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta", + "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho", + NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega", + /* 938 - 944 are not mapped */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", + "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", + "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega", + /* 970 - 976 are not mapped 
*/ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "thetasym", "upsih", + NULL, NULL, NULL, + "piv" +}; + +static entity_table_t ent_uni_punct[] = { + /* 8194 */ + "ensp", "emsp", NULL, NULL, NULL, NULL, NULL, + "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm", + NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL, + /* 8216 */ + "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL, + "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL, + /* 8242 */ + "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL, + NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL, + "frasl" +}; + +static entity_table_t ent_uni_euro[] = { + "euro" +}; + +static entity_table_t ent_uni_8465_8501[] = { + /* 8465 */ + "image", NULL, NULL, NULL, NULL, NULL, NULL, + /* 8472 */ + "weierp", NULL, NULL, NULL, + /* 8476 */ + "real", NULL, NULL, NULL, NULL, NULL, + /* 8482 */ + "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8501 */ + "alefsym", +}; + +static entity_table_t ent_uni_8592_9002[] = { + /* 8592 (0x2190) */ + "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8608 (0x21a0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8624 (0x21b0) */ + NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8640 (0x21c0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8656 (0x21d0) */ + "lArr", "uArr", "rArr", "dArr", "hArr", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8672 (0x21e0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, + /* 8704 (0x2200) */ + "forall", NULL, "part", "exist", NULL, "empty", NULL, "nabla", + "isin", "notin", NULL, "ni", NULL, NULL, NULL, "prod", + /* 8720 (0x2210) */ + NULL, "sum", "minus", NULL, NULL, NULL, NULL, "lowast", + NULL, NULL, "radic", NULL, NULL, "prop", "infin", NULL, + /* 8736 (0x2220) */ + "ang", NULL, NULL, NULL, NULL, NULL, NULL, "and", + "or", "cap", "cup", "int", NULL, NULL, NULL, NULL, + /* 8752 (0x2230) */ + NULL, NULL, NULL, NULL, "there4", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, "sim", NULL, NULL, NULL, + /* 8768 (0x2240) */ + NULL, NULL, NULL, NULL, NULL, "cong", NULL, NULL, + "asymp", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8784 (0x2250) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8800 (0x2260) */ + "ne", "equiv", NULL, NULL, "le", "ge", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8816 (0x2270) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8832 (0x2280) */ + NULL, NULL, "sub", "sup", "nsub", NULL, "sube", "supe", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8848 (0x2290) */ + NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8864 (0x22a0) */ + NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8880 (0x22b0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8896 (0x22c0) */ + NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8912 (0x22d0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8928 (0x22e0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8944 (0x22f0) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + 
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8960 (0x2300) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL, + /* 8976 (0x2310) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* 8992 (0x2320) */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, "lang", "rang" +}; + +static entity_table_t ent_uni_9674[] = { + /* 9674 */ + "loz" +}; + +static entity_table_t ent_uni_9824_9830[] = { + /* 9824 */ + "spades", NULL, NULL, "clubs", NULL, "hearts", "diams" +}; + +static entity_table_t ent_koi8r[] = { + "#1105", /* "jo "*/ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092", + "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084", + "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090", + "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096", + "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041", + "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048", + "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055", + "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042", + "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063", + "#1066" +}; + +static entity_table_t ent_cp_1251[] = { + "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger", + "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036", + "#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220", + "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250", + "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118", + "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy", + "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn", + "#1030", "#1110", "#1169", "micro", "para", 
"middot", "#1105", + "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111", + "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046", + "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053", + "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060", + "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067", + "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074", + "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081", + "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088", + "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095", + "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102", + "#1103" +}; + +static entity_table_t ent_iso_8859_5[] = { + "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062", + "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069", + "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076", + "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083", + "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090", + "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", + "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104", + "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111", + "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118", + "#1119" +}; + +static entity_table_t ent_cp_866[] = { + + "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566", + "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552", + "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560", + "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608", + "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090", + "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", + "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025", + "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118", + "#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632", + "#160" +}; + +/* MacRoman has a couple of 
low-ascii chars that need mapping too */ +/* Vertical tab (ASCII 11) is often used to store line breaks inside */ +/* DB exports, this mapping changes it to a space */ +static entity_table_t ent_macroman[] = { + "sp", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, "quot", NULL, + NULL, NULL, "amp", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "lt", NULL, "gt", NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml", + "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring", + "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave", + "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml", + "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg", + "cent", "pound", "sect", "bull", "para", "szlig", "reg", + "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash", + "infin", "plusmn", "le", "ge", "yen", "micro", "part", + "sum", "prod", "pi", "int", "ordf", "ordm", "Omega", + "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof", + "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave", + "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo", + "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml", + "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger", + "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute", + "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute", + "Ocirc", "#xF8FF", "Ograve", 
"Uacute", "Ucirc", "Ugrave", "#305", + "circ", "tilde", "macr", "#728", "#729", "#730", "cedil", + "#733", "#731", "#711" +}; + +struct html_entity_map { + enum entity_charset charset; /* charset identifier */ + unsigned int basechar; /* char code at start of table */ + unsigned int endchar; /* last char code in the table */ + entity_table_t *table; /* the table of mappings */ +}; + +static const struct html_entity_map entity_map[] = { + { cs_cp1252, 0x80, 0x9f, ent_cp_1252 }, + { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 }, + { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_utf_8, 338, 402, ent_uni_338_402 }, + { cs_utf_8, 710, 732, ent_uni_spacing }, + { cs_utf_8, 913, 982, ent_uni_greek }, + { cs_utf_8, 8194, 8260, ent_uni_punct }, + { cs_utf_8, 8364, 8364, ent_uni_euro }, + { cs_utf_8, 8465, 8501, ent_uni_8465_8501 }, + { cs_utf_8, 8592, 9002, ent_uni_8592_9002 }, + { cs_utf_8, 9674, 9674, ent_uni_9674 }, + { cs_utf_8, 9824, 9830, ent_uni_9824_9830 }, + { cs_big5, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_koi8r, 0xa3, 0xff, ent_koi8r }, + { cs_cp1251, 0x80, 0xff, ent_cp_1251 }, + { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 }, + { cs_cp866, 0xc0, 0xff, ent_cp_866 }, + { cs_macroman, 0x0b, 0xff, ent_macroman }, + { cs_terminator } +}; + +static const struct { + const char *codeset; + enum entity_charset charset; +} charset_map[] = { + { "ISO-8859-1", cs_8859_1 }, + { "ISO8859-1", cs_8859_1 }, + { "ISO-8859-15", cs_8859_15 }, + { "ISO8859-15", cs_8859_15 }, + { "utf-8", cs_utf_8 }, + { "cp1252", cs_cp1252 }, + { "Windows-1252", cs_cp1252 }, + { "1252", cs_cp1252 }, + { "BIG5", cs_big5 }, + { "950", cs_big5 }, + { "GB2312", cs_gb2312 }, + { "936", cs_gb2312 }, + { "BIG5-HKSCS", cs_big5hkscs }, + { "Shift_JIS", 
cs_sjis }, + { "SJIS", cs_sjis }, + { "932", cs_sjis }, + { "EUCJP", cs_eucjp }, + { "EUC-JP", cs_eucjp }, + { "KOI8-R", cs_koi8r }, + { "koi8-ru", cs_koi8r }, + { "koi8r", cs_koi8r }, + { "cp1251", cs_cp1251 }, + { "Windows-1251", cs_cp1251 }, + { "win-1251", cs_cp1251 }, + { "iso8859-5", cs_8859_5 }, + { "iso-8859-5", cs_8859_5 }, + { "cp866", cs_cp866 }, + { "866", cs_cp866 }, + { "ibm866", cs_cp866 }, + { "MacRoman", cs_macroman }, + { NULL } +}; + +typedef struct { + unsigned short charcode; + char *entity; + int entitylen; + int flags; +} basic_entity_t; + +/* Entities for &, ", ', < and >. entitylen is the length of entity without + * the terminating NUL; the entry with a NULL entity ends the table. */ +static const basic_entity_t basic_entities_ex[] = { + { '&', "&amp;", 5, 0 }, + { '"', "&quot;", 6, ENT_HTML_QUOTE_DOUBLE }, + /* PHP traditionally encodes ' as &#039;, not &apos;, so leave this entry here */ + { '\'', "&#039;", 6, ENT_HTML_QUOTE_SINGLE }, + { '\'', "&apos;", 6, ENT_HTML_QUOTE_SINGLE }, + { '<', "&lt;", 4, 0 }, + { '>', "&gt;", 4, 0 }, + { 0, NULL, 0, 0 } +}; + +/* In some cases, we need to give special treatment to &, so we + * use this instead */ +static const basic_entity_t *basic_entities = &basic_entities_ex[1]; + +typedef struct { + unsigned short un_code_point; /* we don't need bigger */ + unsigned char cs_code; /* currently, we only have maps to single-byte encodings */ +} unicode_mapping; + +static const unicode_mapping unimap_iso885915[] = { + { 0xA5, 0xA5 }, /* yen sign */ + { 0xA7, 0xA7 }, /* section sign */ + { 0xA9, 0xA9 }, /* copyright sign */ + { 0xAA, 0xAA }, /* feminine ordinal indicator */ + { 0xAB, 0xAB }, /* left-pointing double angle quotation mark */ + { 0xAC, 0xAC }, /* not sign */ + { 0xAD, 0xAD }, /* soft hyphen */ + { 0xAE, 0xAE }, /* registered sign */ + { 0xAF, 0xAF }, /* macron */ + { 0xB0, 0xB0 }, /* degree sign */ + { 0xB1, 0xB1 }, /* plus-minus sign */ + { 0xB2, 0xB2 }, /* superscript two */ + { 0xB3, 0xB3 }, /* superscript three */ + { 0xB5, 0xB5 }, /* micro sign */ + { 0xB6, 0xB6 }, /* pilcrow sign */ + { 0xB7, 0xB7 }, /* middle dot */ + { 0xB9, 0xB9 }, /* superscript one */ + { 0xBA, 0xBA 
}, /* masculine ordinal indicator */ + { 0xBB, 0xBB }, /* right-pointing double angle quotation mark */ + { 0x152, 0xBC }, /* latin capital ligature oe */ + { 0x153, 0xBD }, /* latin small ligature oe */ + { 0x160, 0xA6 }, /* latin capital letter s with caron */ + { 0x161, 0xA8 }, /* latin small letter s with caron */ + { 0x178, 0xBE }, /* latin capital letter y with diaeresis */ + { 0x17D, 0xB4 }, /* latin capital letter z with caron */ + { 0x17E, 0xB8 }, /* latin small letter z with caron */ + { 0x20AC, 0xA4 }, /* euro sign */ +}; + +static const unicode_mapping unimap_win1252[] = { + { 0x152, 0x8C }, /* latin capital ligature oe */ + { 0x153, 0x9C }, /* latin small ligature oe */ + { 0x160, 0x8A }, /* latin capital letter s with caron */ + { 0x161, 0x9A }, /* latin small letter s with caron */ + { 0x178, 0x9F }, /* latin capital letter y with diaeresis */ + { 0x17D, 0x8E }, /* latin capital letter z with caron */ + { 0x17E, 0x9E }, /* latin small letter z with caron */ + { 0x192, 0x83 }, /* latin small letter f with hook */ + { 0x2C6, 0x88 }, /* modifier letter circumflex accent */ + { 0x2DC, 0x98 }, /* small tilde */ + { 0x2013, 0x96 }, /* en dash */ + { 0x2014, 0x97 }, /* em dash */ + { 0x2018, 0x91 }, /* left single quotation mark */ + { 0x2019, 0x92 }, /* right single quotation mark */ + { 0x201A, 0x82 }, /* single low-9 quotation mark */ + { 0x201C, 0x93 }, /* left double quotation mark */ + { 0x201D, 0x94 }, /* right double quotation mark */ + { 0x201E, 0x84 }, /* double low-9 quotation mark */ + { 0x2020, 0x86 }, /* dagger */ + { 0x2021, 0x87 }, /* double dagger */ + { 0x2022, 0x95 }, /* bullet */ + { 0x2026, 0x85 }, /* horizontal ellipsis */ + { 0x2030, 0x89 }, /* per mille sign */ + { 0x2039, 0x8B }, /* single left-pointing angle quotation mark */ + { 0x203A, 0x9B }, /* single right-pointing angle quotation mark */ + { 0x20AC, 0x80 }, /* euro sign */ + { 0x2122, 0x99 }, /* trade mark sign */ +}; + +static const unicode_mapping unimap_win1251[] = { + { 
0xA0, 0xA0 }, /* no-break space */ + { 0xA4, 0xA4 }, /* currency sign */ + { 0xA6, 0xA6 }, /* broken bar */ + { 0xA7, 0xA7 }, /* section sign */ + { 0xA9, 0xA9 }, /* copyright sign */ + { 0xAB, 0xAB }, /* left-pointing double angle quotation mark */ + { 0xAC, 0xAC }, /* not sign */ + { 0xAD, 0xAD }, /* soft hyphen */ + { 0xAE, 0xAE }, /* registered sign */ + { 0xB0, 0xB0 }, /* degree sign */ + { 0xB1, 0xB1 }, /* plus-minus sign */ + { 0xB5, 0xB5 }, /* micro sign */ + { 0xB6, 0xB6 }, /* pilcrow sign */ + { 0xB7, 0xB7 }, /* middle dot */ + { 0xBB, 0xBB }, /* right-pointing double angle quotation mark */ + { 0x401, 0xA8 }, /* cyrillic capital letter io */ + { 0x402, 0x80 }, /* cyrillic capital letter dje */ + { 0x403, 0x81 }, /* cyrillic capital letter gje */ + { 0x404, 0xAA }, /* cyrillic capital letter ukrainian ie */ + { 0x405, 0xBD }, /* cyrillic capital letter dze */ + { 0x406, 0xB2 }, /* cyrillic capital letter byelorussian-ukrainian i */ + { 0x407, 0xAF }, /* cyrillic capital letter yi */ + { 0x408, 0xA3 }, /* cyrillic capital letter je */ + { 0x409, 0x8A }, /* cyrillic capital letter lje */ + { 0x40A, 0x8C }, /* cyrillic capital letter nje */ + { 0x40B, 0x8E }, /* cyrillic capital letter tshe */ + { 0x40C, 0x8D }, /* cyrillic capital letter kje */ + { 0x40E, 0xA1 }, /* cyrillic capital letter short u */ + { 0x40F, 0x8F }, /* cyrillic capital letter dzhe */ + { 0x410, 0xC0 }, /* cyrillic capital letter a */ + { 0x411, 0xC1 }, /* cyrillic capital letter be */ + { 0x412, 0xC2 }, /* cyrillic capital letter ve */ + { 0x413, 0xC3 }, /* cyrillic capital letter ghe */ + { 0x414, 0xC4 }, /* cyrillic capital letter de */ + { 0x415, 0xC5 }, /* cyrillic capital letter ie */ + { 0x416, 0xC6 }, /* cyrillic capital letter zhe */ + { 0x417, 0xC7 }, /* cyrillic capital letter ze */ + { 0x418, 0xC8 }, /* cyrillic capital letter i */ + { 0x419, 0xC9 }, /* cyrillic capital letter short i */ + { 0x41A, 0xCA }, /* cyrillic capital letter ka */ + { 0x41B, 0xCB }, /* cyrillic capital 
letter el */ + { 0x41C, 0xCC }, /* cyrillic capital letter em */ + { 0x41D, 0xCD }, /* cyrillic capital letter en */ + { 0x41E, 0xCE }, /* cyrillic capital letter o */ + { 0x41F, 0xCF }, /* cyrillic capital letter pe */ + { 0x420, 0xD0 }, /* cyrillic capital letter er */ + { 0x421, 0xD1 }, /* cyrillic capital letter es */ + { 0x422, 0xD2 }, /* cyrillic capital letter te */ + { 0x423, 0xD3 }, /* cyrillic capital letter u */ + { 0x424, 0xD4 }, /* cyrillic capital letter ef */ + { 0x425, 0xD5 }, /* cyrillic capital letter ha */ + { 0x426, 0xD6 }, /* cyrillic capital letter tse */ + { 0x427, 0xD7 }, /* cyrillic capital letter che */ + { 0x428, 0xD8 }, /* cyrillic capital letter sha */ + { 0x429, 0xD9 }, /* cyrillic capital letter shcha */ + { 0x42A, 0xDA }, /* cyrillic capital letter hard sign */ + { 0x42B, 0xDB }, /* cyrillic capital letter yeru */ + { 0x42C, 0xDC }, /* cyrillic capital letter soft sign */ + { 0x42D, 0xDD }, /* cyrillic capital letter e */ + { 0x42E, 0xDE }, /* cyrillic capital letter yu */ + { 0x42F, 0xDF }, /* cyrillic capital letter ya */ + { 0x430, 0xE0 }, /* cyrillic small letter a */ + { 0x431, 0xE1 }, /* cyrillic small letter be */ + { 0x432, 0xE2 }, /* cyrillic small letter ve */ + { 0x433, 0xE3 }, /* cyrillic small letter ghe */ + { 0x434, 0xE4 }, /* cyrillic small letter de */ + { 0x435, 0xE5 }, /* cyrillic small letter ie */ + { 0x436, 0xE6 }, /* cyrillic small letter zhe */ + { 0x437, 0xE7 }, /* cyrillic small letter ze */ + { 0x438, 0xE8 }, /* cyrillic small letter i */ + { 0x439, 0xE9 }, /* cyrillic small letter short i */ + { 0x43A, 0xEA }, /* cyrillic small letter ka */ + { 0x43B, 0xEB }, /* cyrillic small letter el */ + { 0x43C, 0xEC }, /* cyrillic small letter em */ + { 0x43D, 0xED }, /* cyrillic small letter en */ + { 0x43E, 0xEE }, /* cyrillic small letter o */ + { 0x43F, 0xEF }, /* cyrillic small letter pe */ + { 0x440, 0xF0 }, /* cyrillic small letter er */ + { 0x441, 0xF1 }, /* cyrillic small letter es */ + { 0x442, 0xF2 }, /* 
cyrillic small letter te */ + { 0x443, 0xF3 }, /* cyrillic small letter u */ + { 0x444, 0xF4 }, /* cyrillic small letter ef */ + { 0x445, 0xF5 }, /* cyrillic small letter ha */ + { 0x446, 0xF6 }, /* cyrillic small letter tse */ + { 0x447, 0xF7 }, /* cyrillic small letter che */ + { 0x448, 0xF8 }, /* cyrillic small letter sha */ + { 0x449, 0xF9 }, /* cyrillic small letter shcha */ + { 0x44A, 0xFA }, /* cyrillic small letter hard sign */ + { 0x44B, 0xFB }, /* cyrillic small letter yeru */ + { 0x44C, 0xFC }, /* cyrillic small letter soft sign */ + { 0x44D, 0xFD }, /* cyrillic small letter e */ + { 0x44E, 0xFE }, /* cyrillic small letter yu */ + { 0x44F, 0xFF }, /* cyrillic small letter ya */ + { 0x451, 0xB8 }, /* cyrillic small letter io */ + { 0x452, 0x90 }, /* cyrillic small letter dje */ + { 0x453, 0x83 }, /* cyrillic small letter gje */ + { 0x454, 0xBA }, /* cyrillic small letter ukrainian ie */ + { 0x455, 0xBE }, /* cyrillic small letter dze */ + { 0x456, 0xB3 }, /* cyrillic small letter byelorussian-ukrainian i */ + { 0x457, 0xBF }, /* cyrillic small letter yi */ + { 0x458, 0xBC }, /* cyrillic small letter je */ + { 0x459, 0x9A }, /* cyrillic small letter lje */ + { 0x45A, 0x9C }, /* cyrillic small letter nje */ + { 0x45B, 0x9E }, /* cyrillic small letter tshe */ + { 0x45C, 0x9D }, /* cyrillic small letter kje */ + { 0x45E, 0xA2 }, /* cyrillic small letter short u */ + { 0x45F, 0x9F }, /* cyrillic small letter dzhe */ + { 0x490, 0xA5 }, /* cyrillic capital letter ghe with upturn */ + { 0x491, 0xB4 }, /* cyrillic small letter ghe with upturn */ + { 0x2013, 0x96 }, /* en dash */ + { 0x2014, 0x97 }, /* em dash */ + { 0x2018, 0x91 }, /* left single quotation mark */ + { 0x2019, 0x92 }, /* right single quotation mark */ + { 0x201A, 0x82 }, /* single low-9 quotation mark */ + { 0x201C, 0x93 }, /* left double quotation mark */ + { 0x201D, 0x94 }, /* right double quotation mark */ + { 0x201E, 0x84 }, /* double low-9 quotation mark */ + { 0x2020, 0x86 }, /* dagger */ + { 
0x2021, 0x87 }, /* double dagger */ + { 0x2022, 0x95 }, /* bullet */ + { 0x2026, 0x85 }, /* horizontal ellipsis */ + { 0x2030, 0x89 }, /* per mille sign */ + { 0x2039, 0x8B }, /* single left-pointing angle quotation mark */ + { 0x203A, 0x9B }, /* single right-pointing angle quotation mark */ + { 0x20AC, 0x88 }, /* euro sign */ + { 0x2116, 0xB9 }, /* numero sign */ + { 0x2122, 0x99 }, /* trade mark sign */ +}; + +static const unicode_mapping unimap_koi8r[] = { + { 0xA0, 0x9A }, /* no-break space */ + { 0xA9, 0xBF }, /* copyright sign */ + { 0xB0, 0x9C }, /* degree sign */ + { 0xB2, 0x9D }, /* superscript two */ + { 0xB7, 0x9E }, /* middle dot */ + { 0xF7, 0x9F }, /* division sign */ + { 0x401, 0xB3 }, /* cyrillic capital letter io */ + { 0x410, 0xE1 }, /* cyrillic capital letter a */ + { 0x411, 0xE2 }, /* cyrillic capital letter be */ + { 0x412, 0xF7 }, /* cyrillic capital letter ve */ + { 0x413, 0xE7 }, /* cyrillic capital letter ghe */ + { 0x414, 0xE4 }, /* cyrillic capital letter de */ + { 0x415, 0xE5 }, /* cyrillic capital letter ie */ + { 0x416, 0xF6 }, /* cyrillic capital letter zhe */ + { 0x417, 0xFA }, /* cyrillic capital letter ze */ + { 0x418, 0xE9 }, /* cyrillic capital letter i */ + { 0x419, 0xEA }, /* cyrillic capital letter short i */ + { 0x41A, 0xEB }, /* cyrillic capital letter ka */ + { 0x41B, 0xEC }, /* cyrillic capital letter el */ + { 0x41C, 0xED }, /* cyrillic capital letter em */ + { 0x41D, 0xEE }, /* cyrillic capital letter en */ + { 0x41E, 0xEF }, /* cyrillic capital letter o */ + { 0x41F, 0xF0 }, /* cyrillic capital letter pe */ + { 0x420, 0xF2 }, /* cyrillic capital letter er */ + { 0x421, 0xF3 }, /* cyrillic capital letter es */ + { 0x422, 0xF4 }, /* cyrillic capital letter te */ + { 0x423, 0xF5 }, /* cyrillic capital letter u */ + { 0x424, 0xE6 }, /* cyrillic capital letter ef */ + { 0x425, 0xE8 }, /* cyrillic capital letter ha */ + { 0x426, 0xE3 }, /* cyrillic capital letter tse */ + { 0x427, 0xFE }, /* cyrillic capital letter che */ + { 
0x428, 0xFB }, /* cyrillic capital letter sha */ + { 0x429, 0xFD }, /* cyrillic capital letter shcha */ + { 0x42A, 0xFF }, /* cyrillic capital letter hard sign */ + { 0x42B, 0xF9 }, /* cyrillic capital letter yeru */ + { 0x42C, 0xF8 }, /* cyrillic capital letter soft sign */ + { 0x42D, 0xFC }, /* cyrillic capital letter e */ + { 0x42E, 0xE0 }, /* cyrillic capital letter yu */ + { 0x42F, 0xF1 }, /* cyrillic capital letter ya */ + { 0x430, 0xC1 }, /* cyrillic small letter a */ + { 0x431, 0xC2 }, /* cyrillic small letter be */ + { 0x432, 0xD7 }, /* cyrillic small letter ve */ + { 0x433, 0xC7 }, /* cyrillic small letter ghe */ + { 0x434, 0xC4 }, /* cyrillic small letter de */ + { 0x435, 0xC5 }, /* cyrillic small letter ie */ + { 0x436, 0xD6 }, /* cyrillic small letter zhe */ + { 0x437, 0xDA }, /* cyrillic small letter ze */ + { 0x438, 0xC9 }, /* cyrillic small letter i */ + { 0x439, 0xCA }, /* cyrillic small letter short i */ + { 0x43A, 0xCB }, /* cyrillic small letter ka */ + { 0x43B, 0xCC }, /* cyrillic small letter el */ + { 0x43C, 0xCD }, /* cyrillic small letter em */ + { 0x43D, 0xCE }, /* cyrillic small letter en */ + { 0x43E, 0xCF }, /* cyrillic small letter o */ + { 0x43F, 0xD0 }, /* cyrillic small letter pe */ + { 0x440, 0xD2 }, /* cyrillic small letter er */ + { 0x441, 0xD3 }, /* cyrillic small letter es */ + { 0x442, 0xD4 }, /* cyrillic small letter te */ + { 0x443, 0xD5 }, /* cyrillic small letter u */ + { 0x444, 0xC6 }, /* cyrillic small letter ef */ + { 0x445, 0xC8 }, /* cyrillic small letter ha */ + { 0x446, 0xC3 }, /* cyrillic small letter tse */ + { 0x447, 0xDE }, /* cyrillic small letter che */ + { 0x448, 0xDB }, /* cyrillic small letter sha */ + { 0x449, 0xDD }, /* cyrillic small letter shcha */ + { 0x44A, 0xDF }, /* cyrillic small letter hard sign */ + { 0x44B, 0xD9 }, /* cyrillic small letter yeru */ + { 0x44C, 0xD8 }, /* cyrillic small letter soft sign */ + { 0x44D, 0xDC }, /* cyrillic small letter e */ + { 0x44E, 0xC0 }, /* cyrillic small letter 
yu */ + { 0x44F, 0xD1 }, /* cyrillic small letter ya */ + { 0x451, 0xA3 }, /* cyrillic small letter io */ + { 0x2219, 0x95 }, /* bullet operator */ + { 0x221A, 0x96 }, /* square root */ + { 0x2248, 0x97 }, /* almost equal to */ + { 0x2264, 0x98 }, /* less-than or equal to */ + { 0x2265, 0x99 }, /* greater-than or equal to */ + { 0x2320, 0x93 }, /* top half integral */ + { 0x2321, 0x9B }, /* bottom half integral */ + { 0x2500, 0x80 }, /* box drawings light horizontal */ + { 0x2502, 0x81 }, /* box drawings light vertical */ + { 0x250C, 0x82 }, /* box drawings light down and right */ + { 0x2510, 0x83 }, /* box drawings light down and left */ + { 0x2514, 0x84 }, /* box drawings light up and right */ + { 0x2518, 0x85 }, /* box drawings light up and left */ + { 0x251C, 0x86 }, /* box drawings light vertical and right */ + { 0x2524, 0x87 }, /* box drawings light vertical and left */ + { 0x252C, 0x88 }, /* box drawings light down and horizontal */ + { 0x2534, 0x89 }, /* box drawings light up and horizontal */ + { 0x253C, 0x8A }, /* box drawings light vertical and horizontal */ + { 0x2550, 0xA0 }, /* box drawings double horizontal */ + { 0x2551, 0xA1 }, /* box drawings double vertical */ + { 0x2552, 0xA2 }, /* box drawings down single and right double */ + { 0x2553, 0xA4 }, /* box drawings down double and right single */ + { 0x2554, 0xA5 }, /* box drawings double down and right */ + { 0x2555, 0xA6 }, /* box drawings down single and left double */ + { 0x2556, 0xA7 }, /* box drawings down double and left single */ + { 0x2557, 0xA8 }, /* box drawings double down and left */ + { 0x2558, 0xA9 }, /* box drawings up single and right double */ + { 0x2559, 0xAA }, /* box drawings up double and right single */ + { 0x255A, 0xAB }, /* box drawings double up and right */ + { 0x255B, 0xAC }, /* box drawings up single and left double */ + { 0x255C, 0xAD }, /* box drawings up double and left single */ + { 0x255D, 0xAE }, /* box drawings double up and left */ + { 0x255E, 0xAF }, /* box 
drawings vertical single and right double */ + { 0x255F, 0xB0 }, /* box drawings vertical double and right single */ + { 0x2560, 0xB1 }, /* box drawings double vertical and right */ + { 0x2561, 0xB2 }, /* box drawings vertical single and left double */ + { 0x2562, 0xB4 }, /* box drawings vertical double and left single */ + { 0x2563, 0xB5 }, /* box drawings double vertical and left */ + { 0x2564, 0xB6 }, /* box drawings down single and horizontal double */ + { 0x2565, 0xB7 }, /* box drawings down double and horizontal single */ + { 0x2566, 0xB8 }, /* box drawings double down and horizontal */ + { 0x2567, 0xB9 }, /* box drawings up single and horizontal double */ + { 0x2568, 0xBA }, /* box drawings up double and horizontal single */ + { 0x2569, 0xBB }, /* box drawings double up and horizontal */ + { 0x256A, 0xBC }, /* box drawings vertical single and horizontal double */ + { 0x256B, 0xBD }, /* box drawings vertical double and horizontal single */ + { 0x256C, 0xBE }, /* box drawings double vertical and horizontal */ + { 0x2580, 0x8B }, /* upper half block */ + { 0x2584, 0x8C }, /* lower half block */ + { 0x2588, 0x8D }, /* full block */ + { 0x258C, 0x8E }, /* left half block */ + { 0x2590, 0x8F }, /* right half block */ + { 0x2591, 0x90 }, /* light shade */ + { 0x2592, 0x91 }, /* medium shade */ + { 0x2593, 0x92 }, /* dark shade */ + { 0x25A0, 0x94 }, /* black square */ +}; + +static const unicode_mapping unimap_cp866[] = { + { 0xA0, 0xFF }, /* no-break space */ + { 0xA4, 0xFD }, /* currency sign */ + { 0xB0, 0xF8 }, /* degree sign */ + { 0xB7, 0xFA }, /* middle dot */ + { 0x401, 0xF0 }, /* cyrillic capital letter io */ + { 0x404, 0xF2 }, /* cyrillic capital letter ukrainian ie */ + { 0x407, 0xF4 }, /* cyrillic capital letter yi */ + { 0x40E, 0xF6 }, /* cyrillic capital letter short u */ + { 0x410, 0x80 }, /* cyrillic capital letter a */ + { 0x411, 0x81 }, /* cyrillic capital letter be */ + { 0x412, 0x82 }, /* cyrillic capital letter ve */ + { 0x413, 0x83 }, /* 
cyrillic capital letter ghe */ + { 0x414, 0x84 }, /* cyrillic capital letter de */ + { 0x415, 0x85 }, /* cyrillic capital letter ie */ + { 0x416, 0x86 }, /* cyrillic capital letter zhe */ + { 0x417, 0x87 }, /* cyrillic capital letter ze */ + { 0x418, 0x88 }, /* cyrillic capital letter i */ + { 0x419, 0x89 }, /* cyrillic capital letter short i */ + { 0x41A, 0x8A }, /* cyrillic capital letter ka */ + { 0x41B, 0x8B }, /* cyrillic capital letter el */ + { 0x41C, 0x8C }, /* cyrillic capital letter em */ + { 0x41D, 0x8D }, /* cyrillic capital letter en */ + { 0x41E, 0x8E }, /* cyrillic capital letter o */ + { 0x41F, 0x8F }, /* cyrillic capital letter pe */ + { 0x420, 0x90 }, /* cyrillic capital letter er */ + { 0x421, 0x91 }, /* cyrillic capital letter es */ + { 0x422, 0x92 }, /* cyrillic capital letter te */ + { 0x423, 0x93 }, /* cyrillic capital letter u */ + { 0x424, 0x94 }, /* cyrillic capital letter ef */ + { 0x425, 0x95 }, /* cyrillic capital letter ha */ + { 0x426, 0x96 }, /* cyrillic capital letter tse */ + { 0x427, 0x97 }, /* cyrillic capital letter che */ + { 0x428, 0x98 }, /* cyrillic capital letter sha */ + { 0x429, 0x99 }, /* cyrillic capital letter shcha */ + { 0x42A, 0x9A }, /* cyrillic capital letter hard sign */ + { 0x42B, 0x9B }, /* cyrillic capital letter yeru */ + { 0x42C, 0x9C }, /* cyrillic capital letter soft sign */ + { 0x42D, 0x9D }, /* cyrillic capital letter e */ + { 0x42E, 0x9E }, /* cyrillic capital letter yu */ + { 0x42F, 0x9F }, /* cyrillic capital letter ya */ + { 0x430, 0xA0 }, /* cyrillic small letter a */ + { 0x431, 0xA1 }, /* cyrillic small letter be */ + { 0x432, 0xA2 }, /* cyrillic small letter ve */ + { 0x433, 0xA3 }, /* cyrillic small letter ghe */ + { 0x434, 0xA4 }, /* cyrillic small letter de */ + { 0x435, 0xA5 }, /* cyrillic small letter ie */ + { 0x436, 0xA6 }, /* cyrillic small letter zhe */ + { 0x437, 0xA7 }, /* cyrillic small letter ze */ + { 0x438, 0xA8 }, /* cyrillic small letter i */ + { 0x439, 0xA9 }, /* cyrillic small letter short i */ + { 0x43A, 0xAA }, /* cyrillic 
small letter ka */ + { 0x43B, 0xAB }, /* cyrillic small letter el */ + { 0x43C, 0xAC }, /* cyrillic small letter em */ + { 0x43D, 0xAD }, /* cyrillic small letter en */ + { 0x43E, 0xAE }, /* cyrillic small letter o */ + { 0x43F, 0xAF }, /* cyrillic small letter pe */ + { 0x440, 0xE0 }, /* cyrillic small letter er */ + { 0x441, 0xE1 }, /* cyrillic small letter es */ + { 0x442, 0xE2 }, /* cyrillic small letter te */ + { 0x443, 0xE3 }, /* cyrillic small letter u */ + { 0x444, 0xE4 }, /* cyrillic small letter ef */ + { 0x445, 0xE5 }, /* cyrillic small letter ha */ + { 0x446, 0xE6 }, /* cyrillic small letter tse */ + { 0x447, 0xE7 }, /* cyrillic small letter che */ + { 0x448, 0xE8 }, /* cyrillic small letter sha */ + { 0x449, 0xE9 }, /* cyrillic small letter shcha */ + { 0x44A, 0xEA }, /* cyrillic small letter hard sign */ + { 0x44B, 0xEB }, /* cyrillic small letter yeru */ + { 0x44C, 0xEC }, /* cyrillic small letter soft sign */ + { 0x44D, 0xED }, /* cyrillic small letter e */ + { 0x44E, 0xEE }, /* cyrillic small letter yu */ + { 0x44F, 0xEF }, /* cyrillic small letter ya */ + { 0x451, 0xF1 }, /* cyrillic small letter io */ + { 0x454, 0xF3 }, /* cyrillic small letter ukrainian ie */ + { 0x457, 0xF5 }, /* cyrillic small letter yi */ + { 0x45E, 0xF7 }, /* cyrillic small letter short u */ + { 0x2116, 0xFC }, /* numero sign */ + { 0x2219, 0xF9 }, /* bullet operator */ + { 0x221A, 0xFB }, /* square root */ + { 0x2500, 0xC4 }, /* box drawings light horizontal */ + { 0x2502, 0xB3 }, /* box drawings light vertical */ + { 0x250C, 0xDA }, /* box drawings light down and right */ + { 0x2510, 0xBF }, /* box drawings light down and left */ + { 0x2514, 0xC0 }, /* box drawings light up and right */ + { 0x2518, 0xD9 }, /* box drawings light up and left */ + { 0x251C, 0xC3 }, /* box drawings light vertical and right */ + { 0x2524, 0xB4 }, /* box drawings light vertical and left */ + { 0x252C, 0xC2 }, /* box drawings light down and horizontal */ + { 0x2534, 0xC1 }, /* box drawings light 
up and horizontal */ + { 0x253C, 0xC5 }, /* box drawings light vertical and horizontal */ + { 0x2550, 0xCD }, /* box drawings double horizontal */ + { 0x2551, 0xBA }, /* box drawings double vertical */ + { 0x2552, 0xD5 }, /* box drawings down single and right double */ + { 0x2553, 0xD6 }, /* box drawings down double and right single */ + { 0x2554, 0xC9 }, /* box drawings double down and right */ + { 0x2555, 0xB8 }, /* box drawings down single and left double */ + { 0x2556, 0xB7 }, /* box drawings down double and left single */ + { 0x2557, 0xBB }, /* box drawings double down and left */ + { 0x2558, 0xD4 }, /* box drawings up single and right double */ + { 0x2559, 0xD3 }, /* box drawings up double and right single */ + { 0x255A, 0xC8 }, /* box drawings double up and right */ + { 0x255B, 0xBE }, /* box drawings up single and left double */ + { 0x255C, 0xBD }, /* box drawings up double and left single */ + { 0x255D, 0xBC }, /* box drawings double up and left */ + { 0x255E, 0xC6 }, /* box drawings vertical single and right double */ + { 0x255F, 0xC7 }, /* box drawings vertical double and right single */ + { 0x2560, 0xCC }, /* box drawings double vertical and right */ + { 0x2561, 0xB5 }, /* box drawings vertical single and left double */ + { 0x2562, 0xB6 }, /* box drawings vertical double and left single */ + { 0x2563, 0xB9 }, /* box drawings double vertical and left */ + { 0x2564, 0xD1 }, /* box drawings down single and horizontal double */ + { 0x2565, 0xD2 }, /* box drawings down double and horizontal single */ + { 0x2566, 0xCB }, /* box drawings double down and horizontal */ + { 0x2567, 0xCF }, /* box drawings up single and horizontal double */ + { 0x2568, 0xD0 }, /* box drawings up double and horizontal single */ + { 0x2569, 0xCA }, /* box drawings double up and horizontal */ + { 0x256A, 0xD8 }, /* box drawings vertical single and horizontal double */ + { 0x256B, 0xD7 }, /* box drawings vertical double and horizontal single */ + { 0x256C, 0xCE }, /* box drawings 
double vertical and horizontal */ + { 0x2580, 0xDF }, /* upper half block */ + { 0x2584, 0xDC }, /* lower half block */ + { 0x2588, 0xDB }, /* full block */ + { 0x258C, 0xDD }, /* left half block */ + { 0x2590, 0xDE }, /* right half block */ + { 0x2591, 0xB0 }, /* light shade */ + { 0x2592, 0xB1 }, /* medium shade */ + { 0x2593, 0xB2 }, /* dark shade */ + { 0x25A0, 0xFE }, /* black square */ +}; + +static const unicode_mapping unimap_macroman[] = { + { 0xA0, 0xCA }, /* no-break space */ + { 0xA1, 0xC1 }, /* inverted exclamation mark */ + { 0xA2, 0xA2 }, /* cent sign */ + { 0xA3, 0xA3 }, /* pound sign */ + { 0xA5, 0xB4 }, /* yen sign */ + { 0xA7, 0xA4 }, /* section sign */ + { 0xA8, 0xAC }, /* diaeresis */ + { 0xA9, 0xA9 }, /* copyright sign */ + { 0xAA, 0xBB }, /* feminine ordinal indicator */ + { 0xAB, 0xC7 }, /* left-pointing double angle quotation mark */ + { 0xAC, 0xC2 }, /* not sign */ + { 0xAE, 0xA8 }, /* registered sign */ + { 0xAF, 0xF8 }, /* macron */ + { 0xB0, 0xA1 }, /* degree sign */ + { 0xB1, 0xB1 }, /* plus-minus sign */ + { 0xB4, 0xAB }, /* acute accent */ + { 0xB5, 0xB5 }, /* micro sign */ + { 0xB6, 0xA6 }, /* pilcrow sign */ + { 0xB7, 0xE1 }, /* middle dot */ + { 0xB8, 0xFC }, /* cedilla */ + { 0xBA, 0xBC }, /* masculine ordinal indicator */ + { 0xBB, 0xC8 }, /* right-pointing double angle quotation mark */ + { 0xBF, 0xC0 }, /* inverted question mark */ + { 0xC0, 0xCB }, /* latin capital letter a with grave */ + { 0xC1, 0xE7 }, /* latin capital letter a with acute */ + { 0xC2, 0xE5 }, /* latin capital letter a with circumflex */ + { 0xC3, 0xCC }, /* latin capital letter a with tilde */ + { 0xC4, 0x80 }, /* latin capital letter a with diaeresis */ + { 0xC5, 0x81 }, /* latin capital letter a with ring above */ + { 0xC6, 0xAE }, /* latin capital letter ae */ + { 0xC7, 0x82 }, /* latin capital letter c with cedilla */ + { 0xC8, 0xE9 }, /* latin capital letter e with grave */ + { 0xC9, 0x83 }, /* latin capital letter e with acute */ + { 0xCA, 0xE6 }, /* 
latin capital letter e with circumflex */ + { 0xCB, 0xE8 }, /* latin capital letter e with diaeresis */ + { 0xCC, 0xED }, /* latin capital letter i with grave */ + { 0xCD, 0xEA }, /* latin capital letter i with acute */ + { 0xCE, 0xEB }, /* latin capital letter i with circumflex */ + { 0xCF, 0xEC }, /* latin capital letter i with diaeresis */ + { 0xD1, 0x84 }, /* latin capital letter n with tilde */ + { 0xD2, 0xF1 }, /* latin capital letter o with grave */ + { 0xD3, 0xEE }, /* latin capital letter o with acute */ + { 0xD4, 0xEF }, /* latin capital letter o with circumflex */ + { 0xD5, 0xCD }, /* latin capital letter o with tilde */ + { 0xD6, 0x85 }, /* latin capital letter o with diaeresis */ + { 0xD8, 0xAF }, /* latin capital letter o with stroke */ + { 0xD9, 0xF4 }, /* latin capital letter u with grave */ + { 0xDA, 0xF2 }, /* latin capital letter u with acute */ + { 0xDB, 0xF3 }, /* latin capital letter u with circumflex */ + { 0xDC, 0x86 }, /* latin capital letter u with diaeresis */ + { 0xDF, 0xA7 }, /* latin small letter sharp s */ + { 0xE0, 0x88 }, /* latin small letter a with grave */ + { 0xE1, 0x87 }, /* latin small letter a with acute */ + { 0xE2, 0x89 }, /* latin small letter a with circumflex */ + { 0xE3, 0x8B }, /* latin small letter a with tilde */ + { 0xE4, 0x8A }, /* latin small letter a with diaeresis */ + { 0xE5, 0x8C }, /* latin small letter a with ring above */ + { 0xE6, 0xBE }, /* latin small letter ae */ + { 0xE7, 0x8D }, /* latin small letter c with cedilla */ + { 0xE8, 0x8F }, /* latin small letter e with grave */ + { 0xE9, 0x8E }, /* latin small letter e with acute */ + { 0xEA, 0x90 }, /* latin small letter e with circumflex */ + { 0xEB, 0x91 }, /* latin small letter e with diaeresis */ + { 0xEC, 0x93 }, /* latin small letter i with grave */ + { 0xED, 0x92 }, /* latin small letter i with acute */ + { 0xEE, 0x94 }, /* latin small letter i with circumflex */ + { 0xEF, 0x95 }, /* latin small letter i with diaeresis */ + { 0xF1, 0x96 }, /* latin 
small letter n with tilde */ + { 0xF2, 0x98 }, /* latin small letter o with grave */ + { 0xF3, 0x97 }, /* latin small letter o with acute */ + { 0xF4, 0x99 }, /* latin small letter o with circumflex */ + { 0xF5, 0x9B }, /* latin small letter o with tilde */ + { 0xF6, 0x9A }, /* latin small letter o with diaeresis */ + { 0xF7, 0xD6 }, /* division sign */ + { 0xF8, 0xBF }, /* latin small letter o with stroke */ + { 0xF9, 0x9D }, /* latin small letter u with grave */ + { 0xFA, 0x9C }, /* latin small letter u with acute */ + { 0xFB, 0x9E }, /* latin small letter u with circumflex */ + { 0xFC, 0x9F }, /* latin small letter u with diaeresis */ + { 0xFF, 0xD8 }, /* latin small letter y with diaeresis */ + { 0x131, 0xF5 }, /* latin small letter dotless i */ + { 0x152, 0xCE }, /* latin capital ligature oe */ + { 0x153, 0xCF }, /* latin small ligature oe */ + { 0x178, 0xD9 }, /* latin capital letter y with diaeresis */ + { 0x192, 0xC4 }, /* latin small letter f with hook */ + { 0x2C6, 0xF6 }, /* modifier letter circumflex accent */ + { 0x2C7, 0xFF }, /* caron */ + { 0x2D8, 0xF9 }, /* breve */ + { 0x2D9, 0xFA }, /* dot above */ + { 0x2DA, 0xFB }, /* ring above */ + { 0x2DB, 0xFE }, /* ogonek */ + { 0x2DC, 0xF7 }, /* small tilde */ + { 0x2DD, 0xFD }, /* double acute accent */ + { 0x3A9, 0xBD }, /* greek capital letter omega */ + { 0x3C0, 0xB9 }, /* greek small letter pi */ + { 0x2013, 0xD0 }, /* en dash */ + { 0x2014, 0xD1 }, /* em dash */ + { 0x2018, 0xD4 }, /* left single quotation mark */ + { 0x2019, 0xD5 }, /* right single quotation mark */ + { 0x201A, 0xE2 }, /* single low-9 quotation mark */ + { 0x201C, 0xD2 }, /* left double quotation mark */ + { 0x201D, 0xD3 }, /* right double quotation mark */ + { 0x201E, 0xE3 }, /* double low-9 quotation mark */ + { 0x2020, 0xA0 }, /* dagger */ + { 0x2021, 0xE0 }, /* double dagger */ + { 0x2022, 0xA5 }, /* bullet */ + { 0x2026, 0xC9 }, /* horizontal ellipsis */ + { 0x2030, 0xE4 }, /* per mille sign */ + { 0x2039, 0xDC }, /* single 
left-pointing angle quotation mark */ + { 0x203A, 0xDD }, /* single right-pointing angle quotation mark */ + { 0x2044, 0xDA }, /* fraction slash */ + { 0x20AC, 0xDB }, /* euro sign */ + { 0x2122, 0xAA }, /* trade mark sign */ + { 0x2202, 0xB6 }, /* partial differential */ + { 0x2206, 0xC6 }, /* increment */ + { 0x220F, 0xB8 }, /* n-ary product */ + { 0x2211, 0xB7 }, /* n-ary summation */ + { 0x221A, 0xC3 }, /* square root */ + { 0x221E, 0xB0 }, /* infinity */ + { 0x222B, 0xBA }, /* integral */ + { 0x2248, 0xC5 }, /* almost equal to */ + { 0x2260, 0xAD }, /* not equal to */ + { 0x2264, 0xB2 }, /* less-than or equal to */ + { 0x2265, 0xB3 }, /* greater-than or equal to */ + { 0x25CA, 0xD7 }, /* lozenge */ + { 0xF8FF, 0xF0 }, /* apple logo */ + { 0xFB01, 0xDE }, /* latin small ligature fi */ + { 0xFB02, 0xDF }, /* latin small ligature fl */ +}; + +#endif /* HTML_TABLES_H */ diff --git a/ext/standard/tests/strings/get_html_translation_table_basic1.phpt b/ext/standard/tests/strings/get_html_translation_table_basic1.phpt index c09388335b..8b6c9afdaa 100644 --- a/ext/standard/tests/strings/get_html_translation_table_basic1.phpt +++ b/ext/standard/tests/strings/get_html_translation_table_basic1.phpt @@ -43,14 +43,14 @@ echo "Done\n"; *** Testing get_html_translation_table() : basic functionality *** -- with default arguments -- array(4) { + ["&"]=> + string(5) "&" ["""]=> string(6) """ ["<"]=> string(4) "<" [">"]=> string(4) ">" - ["&"]=> - string(5) "&" } -- with table = HTML_ENTITIES -- array(171) { @@ -400,13 +400,13 @@ array(171) { } -- with table = HTML_SPECIALCHARS -- array(4) { + ["&"]=> + string(5) "&" ["""]=> string(6) """ ["<"]=> string(4) "<" [">"]=> string(4) ">" - ["&"]=> - string(5) "&" } Done diff --git a/ext/standard/tests/strings/html_entity_decode_cp866.phpt b/ext/standard/tests/strings/html_entity_decode_cp866.phpt new file mode 100644 index 0000000000..94b23b6660 --- /dev/null +++ b/ext/standard/tests/strings/html_entity_decode_cp866.phpt @@ -0,0 +1,533 
@@ +--TEST-- +Translation of HTML entities for encoding CP866 +--FILE-- +<?php +$arr = array( +0x0410 => array(0x80, "CYRILLIC CAPITAL LETTER A"), +0x0411 => array(0x81, "CYRILLIC CAPITAL LETTER BE"), +0x0412 => array(0x82, "CYRILLIC CAPITAL LETTER VE"), +0x0413 => array(0x83, "CYRILLIC CAPITAL LETTER GHE"), +0x0414 => array(0x84, "CYRILLIC CAPITAL LETTER DE"), +0x0415 => array(0x85, "CYRILLIC CAPITAL LETTER IE"), +0x0416 => array(0x86, "CYRILLIC CAPITAL LETTER ZHE"), +0x0417 => array(0x87, "CYRILLIC CAPITAL LETTER ZE"), +0x0418 => array(0x88, "CYRILLIC CAPITAL LETTER I"), +0x0419 => array(0x89, "CYRILLIC CAPITAL LETTER SHORT I"), +0x041a => array(0x8a, "CYRILLIC CAPITAL LETTER KA"), +0x041b => array(0x8b, "CYRILLIC CAPITAL LETTER EL"), +0x041c => array(0x8c, "CYRILLIC CAPITAL LETTER EM"), +0x041d => array(0x8d, "CYRILLIC CAPITAL LETTER EN"), +0x041e => array(0x8e, "CYRILLIC CAPITAL LETTER O"), +0x041f => array(0x8f, "CYRILLIC CAPITAL LETTER PE"), +0x0420 => array(0x90, "CYRILLIC CAPITAL LETTER ER"), +0x0421 => array(0x91, "CYRILLIC CAPITAL LETTER ES"), +0x0422 => array(0x92, "CYRILLIC CAPITAL LETTER TE"), +0x0423 => array(0x93, "CYRILLIC CAPITAL LETTER U"), +0x0424 => array(0x94, "CYRILLIC CAPITAL LETTER EF"), +0x0425 => array(0x95, "CYRILLIC CAPITAL LETTER HA"), +0x0426 => array(0x96, "CYRILLIC CAPITAL LETTER TSE"), +0x0427 => array(0x97, "CYRILLIC CAPITAL LETTER CHE"), +0x0428 => array(0x98, "CYRILLIC CAPITAL LETTER SHA"), +0x0429 => array(0x99, "CYRILLIC CAPITAL LETTER SHCHA"), +0x042a => array(0x9a, "CYRILLIC CAPITAL LETTER HARD SIGN"), +0x042b => array(0x9b, "CYRILLIC CAPITAL LETTER YERU"), +0x042c => array(0x9c, "CYRILLIC CAPITAL LETTER SOFT SIGN"), +0x042d => array(0x9d, "CYRILLIC CAPITAL LETTER E"), +0x042e => array(0x9e, "CYRILLIC CAPITAL LETTER YU"), +0x042f => array(0x9f, "CYRILLIC CAPITAL LETTER YA"), +0x0430 => array(0xa0, "CYRILLIC SMALL LETTER A"), +0x0431 => array(0xa1, "CYRILLIC SMALL LETTER BE"), +0x0432 => array(0xa2, "CYRILLIC SMALL LETTER 
VE"), +0x0433 => array(0xa3, "CYRILLIC SMALL LETTER GHE"), +0x0434 => array(0xa4, "CYRILLIC SMALL LETTER DE"), +0x0435 => array(0xa5, "CYRILLIC SMALL LETTER IE"), +0x0436 => array(0xa6, "CYRILLIC SMALL LETTER ZHE"), +0x0437 => array(0xa7, "CYRILLIC SMALL LETTER ZE"), +0x0438 => array(0xa8, "CYRILLIC SMALL LETTER I"), +0x0439 => array(0xa9, "CYRILLIC SMALL LETTER SHORT I"), +0x043a => array(0xaa, "CYRILLIC SMALL LETTER KA"), +0x043b => array(0xab, "CYRILLIC SMALL LETTER EL"), +0x043c => array(0xac, "CYRILLIC SMALL LETTER EM"), +0x043d => array(0xad, "CYRILLIC SMALL LETTER EN"), +0x043e => array(0xae, "CYRILLIC SMALL LETTER O"), +0x043f => array(0xaf, "CYRILLIC SMALL LETTER PE"), +0x2591 => array(0xb0, "LIGHT SHADE"), +0x2592 => array(0xb1, "MEDIUM SHADE"), +0x2593 => array(0xb2, "DARK SHADE"), +0x2502 => array(0xb3, "BOX DRAWINGS LIGHT VERTICAL"), +0x2524 => array(0xb4, "BOX DRAWINGS LIGHT VERTICAL AND LEFT"), +0x2561 => array(0xb5, "BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE"), +0x2562 => array(0xb6, "BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE"), +0x2556 => array(0xb7, "BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE"), +0x2555 => array(0xb8, "BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE"), +0x2563 => array(0xb9, "BOX DRAWINGS DOUBLE VERTICAL AND LEFT"), +0x2551 => array(0xba, "BOX DRAWINGS DOUBLE VERTICAL"), +0x2557 => array(0xbb, "BOX DRAWINGS DOUBLE DOWN AND LEFT"), +0x255d => array(0xbc, "BOX DRAWINGS DOUBLE UP AND LEFT"), +0x255c => array(0xbd, "BOX DRAWINGS UP DOUBLE AND LEFT SINGLE"), +0x255b => array(0xbe, "BOX DRAWINGS UP SINGLE AND LEFT DOUBLE"), +0x2510 => array(0xbf, "BOX DRAWINGS LIGHT DOWN AND LEFT"), +0x2514 => array(0xc0, "BOX DRAWINGS LIGHT UP AND RIGHT"), +0x2534 => array(0xc1, "BOX DRAWINGS LIGHT UP AND HORIZONTAL"), +0x252c => array(0xc2, "BOX DRAWINGS LIGHT DOWN AND HORIZONTAL"), +0x251c => array(0xc3, "BOX DRAWINGS LIGHT VERTICAL AND RIGHT"), +0x2500 => array(0xc4, "BOX DRAWINGS LIGHT HORIZONTAL"), +0x253c => array(0xc5, "BOX DRAWINGS LIGHT VERTICAL 
AND HORIZONTAL"), +0x255e => array(0xc6, "BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE"), +0x255f => array(0xc7, "BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE"), +0x255a => array(0xc8, "BOX DRAWINGS DOUBLE UP AND RIGHT"), +0x2554 => array(0xc9, "BOX DRAWINGS DOUBLE DOWN AND RIGHT"), +0x2569 => array(0xca, "BOX DRAWINGS DOUBLE UP AND HORIZONTAL"), +0x2566 => array(0xcb, "BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL"), +0x2560 => array(0xcc, "BOX DRAWINGS DOUBLE VERTICAL AND RIGHT"), +0x2550 => array(0xcd, "BOX DRAWINGS DOUBLE HORIZONTAL"), +0x256c => array(0xce, "BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL"), +0x2567 => array(0xcf, "BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE"), +0x2568 => array(0xd0, "BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE"), +0x2564 => array(0xd1, "BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE"), +0x2565 => array(0xd2, "BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE"), +0x2559 => array(0xd3, "BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE"), +0x2558 => array(0xd4, "BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE"), +0x2552 => array(0xd5, "BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE"), +0x2553 => array(0xd6, "BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE"), +0x256b => array(0xd7, "BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE"), +0x256a => array(0xd8, "BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE"), +0x2518 => array(0xd9, "BOX DRAWINGS LIGHT UP AND LEFT"), +0x250c => array(0xda, "BOX DRAWINGS LIGHT DOWN AND RIGHT"), +0x2588 => array(0xdb, "FULL BLOCK"), +0x2584 => array(0xdc, "LOWER HALF BLOCK"), +0x258c => array(0xdd, "LEFT HALF BLOCK"), +0x2590 => array(0xde, "RIGHT HALF BLOCK"), +0x2580 => array(0xdf, "UPPER HALF BLOCK"), +0x0440 => array(0xe0, "CYRILLIC SMALL LETTER ER"), +0x0441 => array(0xe1, "CYRILLIC SMALL LETTER ES"), +0x0442 => array(0xe2, "CYRILLIC SMALL LETTER TE"), +0x0443 => array(0xe3, "CYRILLIC SMALL LETTER U"), +0x0444 => array(0xe4, "CYRILLIC SMALL LETTER EF"), +0x0445 => array(0xe5, "CYRILLIC SMALL LETTER HA"), +0x0446 => array(0xe6, 
"CYRILLIC SMALL LETTER TSE"), +0x0447 => array(0xe7, "CYRILLIC SMALL LETTER CHE"), +0x0448 => array(0xe8, "CYRILLIC SMALL LETTER SHA"), +0x0449 => array(0xe9, "CYRILLIC SMALL LETTER SHCHA"), +0x044a => array(0xea, "CYRILLIC SMALL LETTER HARD SIGN"), +0x044b => array(0xeb, "CYRILLIC SMALL LETTER YERU"), +0x044c => array(0xec, "CYRILLIC SMALL LETTER SOFT SIGN"), +0x044d => array(0xed, "CYRILLIC SMALL LETTER E"), +0x044e => array(0xee, "CYRILLIC SMALL LETTER YU"), +0x044f => array(0xef, "CYRILLIC SMALL LETTER YA"), +0x0401 => array(0xf0, "CYRILLIC CAPITAL LETTER IO"), +0x0451 => array(0xf1, "CYRILLIC SMALL LETTER IO"), +0x0404 => array(0xf2, "CYRILLIC CAPITAL LETTER UKRAINIAN IE"), +0x0454 => array(0xf3, "CYRILLIC SMALL LETTER UKRAINIAN IE"), +0x0407 => array(0xf4, "CYRILLIC CAPITAL LETTER YI"), +0x0457 => array(0xf5, "CYRILLIC SMALL LETTER YI"), +0x040e => array(0xf6, "CYRILLIC CAPITAL LETTER SHORT U"), +0x045e => array(0xf7, "CYRILLIC SMALL LETTER SHORT U"), +0x00b0 => array(0xf8, "DEGREE SIGN"), +0x2219 => array(0xf9, "BULLET OPERATOR"), +0x00b7 => array(0xfa, "MIDDLE DOT"), +0x221a => array(0xfb, "SQUARE ROOT"), +0x2116 => array(0xfc, "NUMERO SIGN"), +0x00a4 => array(0xfd, "CURRENCY SIGN"), +0x25a0 => array(0xfe, "BLACK SQUARE"), +0x00a0 => array(0xff, "NO-BREAK SPACE"), +); + +foreach ($arr as $u => $v) { + $ent = sprintf("&#x%X;", $u); + $res = html_entity_decode($ent, ENT_QUOTES, 'CP866'); + $d = unpack("H*", $res); + echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]); + + $ent = sprintf("&#x%X;", $v[0]); + $res = html_entity_decode($ent, ENT_QUOTES, 'CP866'); + if ($res[0] != "&" || $res[1] != "#") + $res = unpack("H*", $res)[1]; + echo sprintf("%s => %s\n\n", $ent, $res); +} +--EXPECT-- +CYRILLIC CAPITAL LETTER A: А => 80 +€ => € + +CYRILLIC CAPITAL LETTER BE: Б => 81 + =>  + +CYRILLIC CAPITAL LETTER VE: В => 82 +‚ => ‚ + +CYRILLIC CAPITAL LETTER GHE: Г => 83 +ƒ => ƒ + +CYRILLIC CAPITAL LETTER DE: Д => 84 +„ => „ + +CYRILLIC CAPITAL LETTER IE: Е => 85 +… => 
… + +CYRILLIC CAPITAL LETTER ZHE: Ж => 86 +† => † + +CYRILLIC CAPITAL LETTER ZE: З => 87 +‡ => ‡ + +CYRILLIC CAPITAL LETTER I: И => 88 +ˆ => ˆ + +CYRILLIC CAPITAL LETTER SHORT I: Й => 89 +‰ => ‰ + +CYRILLIC CAPITAL LETTER KA: К => 8a +Š => Š + +CYRILLIC CAPITAL LETTER EL: Л => 8b +‹ => ‹ + +CYRILLIC CAPITAL LETTER EM: М => 8c +Œ => Œ + +CYRILLIC CAPITAL LETTER EN: Н => 8d + =>  + +CYRILLIC CAPITAL LETTER O: О => 8e +Ž => Ž + +CYRILLIC CAPITAL LETTER PE: П => 8f + =>  + +CYRILLIC CAPITAL LETTER ER: Р => 90 + =>  + +CYRILLIC CAPITAL LETTER ES: С => 91 +‘ => ‘ + +CYRILLIC CAPITAL LETTER TE: Т => 92 +’ => ’ + +CYRILLIC CAPITAL LETTER U: У => 93 +“ => “ + +CYRILLIC CAPITAL LETTER EF: Ф => 94 +” => ” + +CYRILLIC CAPITAL LETTER HA: Х => 95 +• => • + +CYRILLIC CAPITAL LETTER TSE: Ц => 96 +– => – + +CYRILLIC CAPITAL LETTER CHE: Ч => 97 +— => — + +CYRILLIC CAPITAL LETTER SHA: Ш => 98 +˜ => ˜ + +CYRILLIC CAPITAL LETTER SHCHA: Щ => 99 +™ => ™ + +CYRILLIC CAPITAL LETTER HARD SIGN: Ъ => 9a +š => š + +CYRILLIC CAPITAL LETTER YERU: Ы => 9b +› => › + +CYRILLIC CAPITAL LETTER SOFT SIGN: Ь => 9c +œ => œ + +CYRILLIC CAPITAL LETTER E: Э => 9d + =>  + +CYRILLIC CAPITAL LETTER YU: Ю => 9f +ž => ž + +CYRILLIC CAPITAL LETTER YA: Я => 2623783432463b +Ÿ => Ÿ + +CYRILLIC SMALL LETTER A: а => a0 +  => ff + +CYRILLIC SMALL LETTER BE: б => a1 +¡ => ¡ + +CYRILLIC SMALL LETTER VE: в => a2 +¢ => ¢ + +CYRILLIC SMALL LETTER GHE: г => a3 +£ => £ + +CYRILLIC SMALL LETTER DE: д => a4 +¤ => fd + +CYRILLIC SMALL LETTER IE: е => a5 +¥ => ¥ + +CYRILLIC SMALL LETTER ZHE: ж => a6 +¦ => ¦ + +CYRILLIC SMALL LETTER ZE: з => a7 +§ => § + +CYRILLIC SMALL LETTER I: и => a8 +¨ => ¨ + +CYRILLIC SMALL LETTER SHORT I: й => a9 +© => © + +CYRILLIC SMALL LETTER KA: к => aa +ª => ª + +CYRILLIC SMALL LETTER EL: л => ab +« => « + +CYRILLIC SMALL LETTER EM: м => ac +¬ => ¬ + +CYRILLIC SMALL LETTER EN: н => ad +­ => ­ + +CYRILLIC SMALL LETTER O: о => ae +® => ® + +CYRILLIC SMALL LETTER PE: п => af +¯ => ¯ + +LIGHT 
SHADE: ░ => b0 +° => f8 + +MEDIUM SHADE: ▒ => b1 +± => ± + +DARK SHADE: ▓ => b2 +² => ² + +BOX DRAWINGS LIGHT VERTICAL: │ => b3 +³ => ³ + +BOX DRAWINGS LIGHT VERTICAL AND LEFT: ┤ => b4 +´ => ´ + +BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE: ╡ => b5 +µ => µ + +BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE: ╢ => b6 +¶ => ¶ + +BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE: ╖ => b7 +· => fa + +BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE: ╕ => b8 +¸ => ¸ + +BOX DRAWINGS DOUBLE VERTICAL AND LEFT: ╣ => b9 +¹ => ¹ + +BOX DRAWINGS DOUBLE VERTICAL: ║ => ba +º => º + +BOX DRAWINGS DOUBLE DOWN AND LEFT: ╗ => bb +» => » + +BOX DRAWINGS DOUBLE UP AND LEFT: ╝ => bc +¼ => ¼ + +BOX DRAWINGS UP DOUBLE AND LEFT SINGLE: ╜ => bd +½ => ½ + +BOX DRAWINGS UP SINGLE AND LEFT DOUBLE: ╛ => be +¾ => ¾ + +BOX DRAWINGS LIGHT DOWN AND LEFT: ┐ => bf +¿ => ¿ + +BOX DRAWINGS LIGHT UP AND RIGHT: └ => c0 +À => À + +BOX DRAWINGS LIGHT UP AND HORIZONTAL: ┴ => c1 +Á => Á + +BOX DRAWINGS LIGHT DOWN AND HORIZONTAL: ┬ => c2 + =>  + +BOX DRAWINGS LIGHT VERTICAL AND RIGHT: ├ => c3 +à => à + +BOX DRAWINGS LIGHT HORIZONTAL: ─ => c4 +Ä => Ä + +BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL: ┼ => c5 +Å => Å + +BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE: ╞ => c6 +Æ => Æ + +BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE: ╟ => c7 +Ç => Ç + +BOX DRAWINGS DOUBLE UP AND RIGHT: ╚ => c8 +È => È + +BOX DRAWINGS DOUBLE DOWN AND RIGHT: ╔ => c9 +É => É + +BOX DRAWINGS DOUBLE UP AND HORIZONTAL: ╩ => ca +Ê => Ê + +BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL: ╦ => cb +Ë => Ë + +BOX DRAWINGS DOUBLE VERTICAL AND RIGHT: ╠ => cc +Ì => Ì + +BOX DRAWINGS DOUBLE HORIZONTAL: ═ => cd +Í => Í + +BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL: ╬ => ce +Î => Î + +BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE: ╧ => cf +Ï => Ï + +BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE: ╨ => d0 +Ð => Ð + +BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE: ╤ => d1 +Ñ => Ñ + +BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE: ╥ => d2 +Ò => Ò + +BOX DRAWINGS UP DOUBLE AND RIGHT 
SINGLE: ╙ => d3 +Ó => Ó + +BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE: ╘ => d4 +Ô => Ô + +BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE: ╒ => d5 +Õ => Õ + +BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE: ╓ => d6 +Ö => Ö + +BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE: ╫ => d7 +× => × + +BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE: ╪ => d8 +Ø => Ø + +BOX DRAWINGS LIGHT UP AND LEFT: ┘ => d9 +Ù => Ù + +BOX DRAWINGS LIGHT DOWN AND RIGHT: ┌ => da +Ú => Ú + +FULL BLOCK: █ => db +Û => Û + +LOWER HALF BLOCK: ▄ => dc +Ü => Ü + +LEFT HALF BLOCK: ▌ => dd +Ý => Ý + +RIGHT HALF BLOCK: ▐ => de +Þ => Þ + +UPPER HALF BLOCK: ▀ => df +ß => ß + +CYRILLIC SMALL LETTER ER: р => e0 +à => à + +CYRILLIC SMALL LETTER ES: с => e1 +á => á + +CYRILLIC SMALL LETTER TE: т => e2 +â => â + +CYRILLIC SMALL LETTER U: у => e3 +ã => ã + +CYRILLIC SMALL LETTER EF: ф => e4 +ä => ä + +CYRILLIC SMALL LETTER HA: х => e5 +å => å + +CYRILLIC SMALL LETTER TSE: ц => e6 +æ => æ + +CYRILLIC SMALL LETTER CHE: ч => e7 +ç => ç + +CYRILLIC SMALL LETTER SHA: ш => e8 +è => è + +CYRILLIC SMALL LETTER SHCHA: щ => e9 +é => é + +CYRILLIC SMALL LETTER HARD SIGN: ъ => ea +ê => ê + +CYRILLIC SMALL LETTER YERU: ы => eb +ë => ë + +CYRILLIC SMALL LETTER SOFT SIGN: ь => ec +ì => ì + +CYRILLIC SMALL LETTER E: э => ed +í => í + +CYRILLIC SMALL LETTER YU: ю => ee +î => î + +CYRILLIC SMALL LETTER YA: я => ef +ï => ï + +CYRILLIC CAPITAL LETTER IO: Ё => f0 +ð => ð + +CYRILLIC SMALL LETTER IO: ё => f1 +ñ => ñ + +CYRILLIC CAPITAL LETTER UKRAINIAN IE: Є => f2 +ò => ò + +CYRILLIC SMALL LETTER UKRAINIAN IE: є => f3 +ó => ó + +CYRILLIC CAPITAL LETTER YI: Ї => f4 +ô => ô + +CYRILLIC SMALL LETTER YI: ї => f5 +õ => õ + +CYRILLIC CAPITAL LETTER SHORT U: Ў => f6 +ö => ö + +CYRILLIC SMALL LETTER SHORT U: ў => f7 +÷ => ÷ + +DEGREE SIGN: ° => f8 +ø => ø + +BULLET OPERATOR: ∙ => f9 +ù => ù + +MIDDLE DOT: · => fa +ú => ú + +SQUARE ROOT: √ => fb +û => û + +NUMERO SIGN: № => fc +ü => ü + +CURRENCY SIGN: ¤ => fd +ý => ý + +BLACK SQUARE: ■ => fe +þ => 
þ + +NO-BREAK SPACE:   => ff +ÿ => ÿ + + diff --git a/ext/standard/tests/strings/html_entity_decode_iso8859-15.phpt b/ext/standard/tests/strings/html_entity_decode_iso8859-15.phpt new file mode 100644 index 0000000000..a3be8f3668 --- /dev/null +++ b/ext/standard/tests/strings/html_entity_decode_iso8859-15.phpt @@ -0,0 +1,405 @@ +--TEST-- +Translation of HTML entities for encoding ISO-8859-15 +--FILE-- +<?php +$arr = array( +0x00A0 => array(0xA0, "NO-BREAK SPACE"), +0x00A1 => array(0xA1, "INVERTED EXCLAMATION MARK"), +0x00A2 => array(0xA2, "CENT SIGN"), +0x00A3 => array(0xA3, "POUND SIGN"), +0x20AC => array(0xA4, "EURO SIGN"), +0x00A5 => array(0xA5, "YEN SIGN"), +0x0160 => array(0xA6, "LATIN CAPITAL LETTER S WITH CARON"), +0x00A7 => array(0xA7, "SECTION SIGN"), +0x0161 => array(0xA8, "LATIN SMALL LETTER S WITH CARON"), +0x00A9 => array(0xA9, "COPYRIGHT SIGN"), +0x00AA => array(0xAA, "FEMININE ORDINAL INDICATOR"), +0x00AB => array(0xAB, "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK"), +0x00AC => array(0xAC, "NOT SIGN"), +0x00AD => array(0xAD, "SOFT HYPHEN"), +0x00AE => array(0xAE, "REGISTERED SIGN"), +0x00AF => array(0xAF, "MACRON"), +0x00B0 => array(0xB0, "DEGREE SIGN"), +0x00B1 => array(0xB1, "PLUS-MINUS SIGN"), +0x00B2 => array(0xB2, "SUPERSCRIPT TWO"), +0x00B3 => array(0xB3, "SUPERSCRIPT THREE"), +0x017D => array(0xB4, "LATIN CAPITAL LETTER Z WITH CARON"), +0x00B5 => array(0xB5, "MICRO SIGN"), +0x00B6 => array(0xB6, "PILCROW SIGN"), +0x00B7 => array(0xB7, "MIDDLE DOT"), +0x017E => array(0xB8, "LATIN SMALL LETTER Z WITH CARON"), +0x00B9 => array(0xB9, "SUPERSCRIPT ONE"), +0x00BA => array(0xBA, "MASCULINE ORDINAL INDICATOR"), +0x00BB => array(0xBB, "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK"), +0x0152 => array(0xBC, "LATIN CAPITAL LIGATURE OE"), +0x0153 => array(0xBD, "LATIN SMALL LIGATURE OE"), +0x0178 => array(0xBE, "LATIN CAPITAL LETTER Y WITH DIAERESIS"), +0x00BF => array(0xBF, "INVERTED QUESTION MARK"), +0x00C0 => array(0xC0, "LATIN CAPITAL LETTER A WITH 
GRAVE"), +0x00C1 => array(0xC1, "LATIN CAPITAL LETTER A WITH ACUTE"), +0x00C2 => array(0xC2, "LATIN CAPITAL LETTER A WITH CIRCUMFLEX"), +0x00C3 => array(0xC3, "LATIN CAPITAL LETTER A WITH TILDE"), +0x00C4 => array(0xC4, "LATIN CAPITAL LETTER A WITH DIAERESIS"), +0x00C5 => array(0xC5, "LATIN CAPITAL LETTER A WITH RING ABOVE"), +0x00C6 => array(0xC6, "LATIN CAPITAL LETTER AE"), +0x00C7 => array(0xC7, "LATIN CAPITAL LETTER C WITH CEDILLA"), +0x00C8 => array(0xC8, "LATIN CAPITAL LETTER E WITH GRAVE"), +0x00C9 => array(0xC9, "LATIN CAPITAL LETTER E WITH ACUTE"), +0x00CA => array(0xCA, "LATIN CAPITAL LETTER E WITH CIRCUMFLEX"), +0x00CB => array(0xCB, "LATIN CAPITAL LETTER E WITH DIAERESIS"), +0x00CC => array(0xCC, "LATIN CAPITAL LETTER I WITH GRAVE"), +0x00CD => array(0xCD, "LATIN CAPITAL LETTER I WITH ACUTE"), +0x00CE => array(0xCE, "LATIN CAPITAL LETTER I WITH CIRCUMFLEX"), +0x00CF => array(0xCF, "LATIN CAPITAL LETTER I WITH DIAERESIS"), +0x00D0 => array(0xD0, "LATIN CAPITAL LETTER ETH"), +0x00D1 => array(0xD1, "LATIN CAPITAL LETTER N WITH TILDE"), +0x00D2 => array(0xD2, "LATIN CAPITAL LETTER O WITH GRAVE"), +0x00D3 => array(0xD3, "LATIN CAPITAL LETTER O WITH ACUTE"), +0x00D4 => array(0xD4, "LATIN CAPITAL LETTER O WITH CIRCUMFLEX"), +0x00D5 => array(0xD5, "LATIN CAPITAL LETTER O WITH TILDE"), +0x00D6 => array(0xD6, "LATIN CAPITAL LETTER O WITH DIAERESIS"), +0x00D7 => array(0xD7, "MULTIPLICATION SIGN"), +0x00D8 => array(0xD8, "LATIN CAPITAL LETTER O WITH STROKE"), +0x00D9 => array(0xD9, "LATIN CAPITAL LETTER U WITH GRAVE"), +0x00DA => array(0xDA, "LATIN CAPITAL LETTER U WITH ACUTE"), +0x00DB => array(0xDB, "LATIN CAPITAL LETTER U WITH CIRCUMFLEX"), +0x00DC => array(0xDC, "LATIN CAPITAL LETTER U WITH DIAERESIS"), +0x00DD => array(0xDD, "LATIN CAPITAL LETTER Y WITH ACUTE"), +0x00DE => array(0xDE, "LATIN CAPITAL LETTER THORN"), +0x00DF => array(0xDF, "LATIN SMALL LETTER SHARP S"), +0x00E0 => array(0xE0, "LATIN SMALL LETTER A WITH GRAVE"), +0x00E1 => array(0xE1, "LATIN 
SMALL LETTER A WITH ACUTE"), +0x00E2 => array(0xE2, "LATIN SMALL LETTER A WITH CIRCUMFLEX"), +0x00E3 => array(0xE3, "LATIN SMALL LETTER A WITH TILDE"), +0x00E4 => array(0xE4, "LATIN SMALL LETTER A WITH DIAERESIS"), +0x00E5 => array(0xE5, "LATIN SMALL LETTER A WITH RING ABOVE"), +0x00E6 => array(0xE6, "LATIN SMALL LETTER AE"), +0x00E7 => array(0xE7, "LATIN SMALL LETTER C WITH CEDILLA"), +0x00E8 => array(0xE8, "LATIN SMALL LETTER E WITH GRAVE"), +0x00E9 => array(0xE9, "LATIN SMALL LETTER E WITH ACUTE"), +0x00EA => array(0xEA, "LATIN SMALL LETTER E WITH CIRCUMFLEX"), +0x00EB => array(0xEB, "LATIN SMALL LETTER E WITH DIAERESIS"), +0x00EC => array(0xEC, "LATIN SMALL LETTER I WITH GRAVE"), +0x00ED => array(0xED, "LATIN SMALL LETTER I WITH ACUTE"), +0x00EE => array(0xEE, "LATIN SMALL LETTER I WITH CIRCUMFLEX"), +0x00EF => array(0xEF, "LATIN SMALL LETTER I WITH DIAERESIS"), +0x00F0 => array(0xF0, "LATIN SMALL LETTER ETH"), +0x00F1 => array(0xF1, "LATIN SMALL LETTER N WITH TILDE"), +0x00F2 => array(0xF2, "LATIN SMALL LETTER O WITH GRAVE"), +0x00F3 => array(0xF3, "LATIN SMALL LETTER O WITH ACUTE"), +0x00F4 => array(0xF4, "LATIN SMALL LETTER O WITH CIRCUMFLEX"), +0x00F5 => array(0xF5, "LATIN SMALL LETTER O WITH TILDE"), +0x00F6 => array(0xF6, "LATIN SMALL LETTER O WITH DIAERESIS"), +0x00F7 => array(0xF7, "DIVISION SIGN"), +0x00F8 => array(0xF8, "LATIN SMALL LETTER O WITH STROKE"), +0x00F9 => array(0xF9, "LATIN SMALL LETTER U WITH GRAVE"), +0x00FA => array(0xFA, "LATIN SMALL LETTER U WITH ACUTE"), +0x00FB => array(0xFB, "LATIN SMALL LETTER U WITH CIRCUMFLEX"), +0x00FC => array(0xFC, "LATIN SMALL LETTER U WITH DIAERESIS"), +0x00FD => array(0xFD, "LATIN SMALL LETTER Y WITH ACUTE"), +0x00FE => array(0xFE, "LATIN SMALL LETTER THORN"), +0x00FF => array(0xFF, "LATIN SMALL LETTER Y WITH DIAERESIS"), +); + +foreach ($arr as $u => $v) { + $ent = sprintf("&#x%X;", $u); + $res = html_entity_decode($ent, ENT_QUOTES, 'ISO-8859-15'); + $d = unpack("H*", $res); + echo sprintf("%s: %s => 
%s\n", $v[1], $ent, $d[1]); + + $ent = sprintf("&#x%X;", $v[0]); + $res = html_entity_decode($ent, ENT_QUOTES, 'ISO-8859-15'); + if ($res[0] != "&" || $res[1] != "#") + $res = unpack("H*", $res)[1]; + echo sprintf("%s => %s\n\n", $ent, $res); +} +--EXPECT-- +NO-BREAK SPACE:   => a0 +  => a0 + +INVERTED EXCLAMATION MARK: ¡ => a1 +¡ => a1 + +CENT SIGN: ¢ => a2 +¢ => a2 + +POUND SIGN: £ => a3 +£ => a3 + +EURO SIGN: € => a4 +¤ => ¤ + +YEN SIGN: ¥ => a5 +¥ => a5 + +LATIN CAPITAL LETTER S WITH CARON: Š => a6 +¦ => ¦ + +SECTION SIGN: § => a7 +§ => a7 + +LATIN SMALL LETTER S WITH CARON: š => a8 +¨ => ¨ + +COPYRIGHT SIGN: © => a9 +© => a9 + +FEMININE ORDINAL INDICATOR: ª => aa +ª => aa + +LEFT-POINTING DOUBLE ANGLE QUOTATION MARK: « => ab +« => ab + +NOT SIGN: ¬ => ac +¬ => ac + +SOFT HYPHEN: ­ => ad +­ => ad + +REGISTERED SIGN: ® => ae +® => ae + +MACRON: ¯ => af +¯ => af + +DEGREE SIGN: ° => b0 +° => b0 + +PLUS-MINUS SIGN: ± => b1 +± => b1 + +SUPERSCRIPT TWO: ² => b2 +² => b2 + +SUPERSCRIPT THREE: ³ => b3 +³ => b3 + +LATIN CAPITAL LETTER Z WITH CARON: Ž => b4 +´ => ´ + +MICRO SIGN: µ => b5 +µ => b5 + +PILCROW SIGN: ¶ => b6 +¶ => b6 + +MIDDLE DOT: · => b7 +· => b7 + +LATIN SMALL LETTER Z WITH CARON: ž => b8 +¸ => ¸ + +SUPERSCRIPT ONE: ¹ => b9 +¹ => b9 + +MASCULINE ORDINAL INDICATOR: º => ba +º => ba + +RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK: » => bb +» => bb + +LATIN CAPITAL LIGATURE OE: Œ => bc +¼ => ¼ + +LATIN SMALL LIGATURE OE: œ => bd +½ => ½ + +LATIN CAPITAL LETTER Y WITH DIAERESIS: Ÿ => be +¾ => ¾ + +INVERTED QUESTION MARK: ¿ => bf +¿ => bf + +LATIN CAPITAL LETTER A WITH GRAVE: À => c0 +À => c0 + +LATIN CAPITAL LETTER A WITH ACUTE: Á => c1 +Á => c1 + +LATIN CAPITAL LETTER A WITH CIRCUMFLEX:  => c2 + => c2 + +LATIN CAPITAL LETTER A WITH TILDE: à => c3 +à => c3 + +LATIN CAPITAL LETTER A WITH DIAERESIS: Ä => c4 +Ä => c4 + +LATIN CAPITAL LETTER A WITH RING ABOVE: Å => c5 +Å => c5 + +LATIN CAPITAL LETTER AE: Æ => c6 +Æ => c6 + +LATIN CAPITAL LETTER C WITH CEDILLA: Ç 
=> c7 +Ç => c7 + +LATIN CAPITAL LETTER E WITH GRAVE: È => c8 +È => c8 + +LATIN CAPITAL LETTER E WITH ACUTE: É => c9 +É => c9 + +LATIN CAPITAL LETTER E WITH CIRCUMFLEX: Ê => ca +Ê => ca + +LATIN CAPITAL LETTER E WITH DIAERESIS: Ë => cb +Ë => cb + +LATIN CAPITAL LETTER I WITH GRAVE: Ì => cc +Ì => cc + +LATIN CAPITAL LETTER I WITH ACUTE: Í => cd +Í => cd + +LATIN CAPITAL LETTER I WITH CIRCUMFLEX: Î => ce +Î => ce + +LATIN CAPITAL LETTER I WITH DIAERESIS: Ï => cf +Ï => cf + +LATIN CAPITAL LETTER ETH: Ð => d0 +Ð => d0 + +LATIN CAPITAL LETTER N WITH TILDE: Ñ => d1 +Ñ => d1 + +LATIN CAPITAL LETTER O WITH GRAVE: Ò => d2 +Ò => d2 + +LATIN CAPITAL LETTER O WITH ACUTE: Ó => d3 +Ó => d3 + +LATIN CAPITAL LETTER O WITH CIRCUMFLEX: Ô => d4 +Ô => d4 + +LATIN CAPITAL LETTER O WITH TILDE: Õ => d5 +Õ => d5 + +LATIN CAPITAL LETTER O WITH DIAERESIS: Ö => d6 +Ö => d6 + +MULTIPLICATION SIGN: × => d7 +× => d7 + +LATIN CAPITAL LETTER O WITH STROKE: Ø => d8 +Ø => d8 + +LATIN CAPITAL LETTER U WITH GRAVE: Ù => d9 +Ù => d9 + +LATIN CAPITAL LETTER U WITH ACUTE: Ú => da +Ú => da + +LATIN CAPITAL LETTER U WITH CIRCUMFLEX: Û => db +Û => db + +LATIN CAPITAL LETTER U WITH DIAERESIS: Ü => dc +Ü => dc + +LATIN CAPITAL LETTER Y WITH ACUTE: Ý => dd +Ý => dd + +LATIN CAPITAL LETTER THORN: Þ => de +Þ => de + +LATIN SMALL LETTER SHARP S: ß => df +ß => df + +LATIN SMALL LETTER A WITH GRAVE: à => e0 +à => e0 + +LATIN SMALL LETTER A WITH ACUTE: á => e1 +á => e1 + +LATIN SMALL LETTER A WITH CIRCUMFLEX: â => e2 +â => e2 + +LATIN SMALL LETTER A WITH TILDE: ã => e3 +ã => e3 + +LATIN SMALL LETTER A WITH DIAERESIS: ä => e4 +ä => e4 + +LATIN SMALL LETTER A WITH RING ABOVE: å => e5 +å => e5 + +LATIN SMALL LETTER AE: æ => e6 +æ => e6 + +LATIN SMALL LETTER C WITH CEDILLA: ç => e7 +ç => e7 + +LATIN SMALL LETTER E WITH GRAVE: è => e8 +è => e8 + +LATIN SMALL LETTER E WITH ACUTE: é => e9 +é => e9 + +LATIN SMALL LETTER E WITH CIRCUMFLEX: ê => ea +ê => ea + +LATIN SMALL LETTER E WITH DIAERESIS: ë => eb +ë => eb + +LATIN 
SMALL LETTER I WITH GRAVE: ì => ec +ì => ec + +LATIN SMALL LETTER I WITH ACUTE: í => ed +í => ed + +LATIN SMALL LETTER I WITH CIRCUMFLEX: î => ee +î => ee + +LATIN SMALL LETTER I WITH DIAERESIS: ï => ef +ï => ef + +LATIN SMALL LETTER ETH: ð => f0 +ð => f0 + +LATIN SMALL LETTER N WITH TILDE: ñ => f1 +ñ => f1 + +LATIN SMALL LETTER O WITH GRAVE: ò => f2 +ò => f2 + +LATIN SMALL LETTER O WITH ACUTE: ó => f3 +ó => f3 + +LATIN SMALL LETTER O WITH CIRCUMFLEX: ô => f4 +ô => f4 + +LATIN SMALL LETTER O WITH TILDE: õ => f5 +õ => f5 + +LATIN SMALL LETTER O WITH DIAERESIS: ö => f6 +ö => f6 + +DIVISION SIGN: ÷ => f7 +÷ => f7 + +LATIN SMALL LETTER O WITH STROKE: ø => f8 +ø => f8 + +LATIN SMALL LETTER U WITH GRAVE: ù => f9 +ù => f9 + +LATIN SMALL LETTER U WITH ACUTE: ú => fa +ú => fa + +LATIN SMALL LETTER U WITH CIRCUMFLEX: û => fb +û => fb + +LATIN SMALL LETTER U WITH DIAERESIS: ü => fc +ü => fc + +LATIN SMALL LETTER Y WITH ACUTE: ý => fd +ý => fd + +LATIN SMALL LETTER THORN: þ => fe +þ => fe + +LATIN SMALL LETTER Y WITH DIAERESIS: ÿ => ff +ÿ => ff + + diff --git a/ext/standard/tests/strings/html_entity_decode_iso8859-5.phpt b/ext/standard/tests/strings/html_entity_decode_iso8859-5.phpt new file mode 100644 index 0000000000..6a65413c9c --- /dev/null +++ b/ext/standard/tests/strings/html_entity_decode_iso8859-5.phpt @@ -0,0 +1,405 @@ +--TEST-- +Translation of HTML entities for encoding ISO-8859-5 +--FILE-- +<?php +$arr = array( +0x00A0 => array(0xA0, "NO-BREAK SPACE"), +0x0401 => array(0xA1, "CYRILLIC CAPITAL LETTER IO"), +0x0402 => array(0xA2, "CYRILLIC CAPITAL LETTER DJE"), +0x0403 => array(0xA3, "CYRILLIC CAPITAL LETTER GJE"), +0x0404 => array(0xA4, "CYRILLIC CAPITAL LETTER UKRAINIAN IE"), +0x0405 => array(0xA5, "CYRILLIC CAPITAL LETTER DZE"), +0x0406 => array(0xA6, "CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I"), +0x0407 => array(0xA7, "CYRILLIC CAPITAL LETTER YI"), +0x0408 => array(0xA8, "CYRILLIC CAPITAL LETTER JE"), +0x0409 => array(0xA9, "CYRILLIC CAPITAL LETTER LJE"), 
+0x040A => array(0xAA, "CYRILLIC CAPITAL LETTER NJE"), +0x040B => array(0xAB, "CYRILLIC CAPITAL LETTER TSHE"), +0x040C => array(0xAC, "CYRILLIC CAPITAL LETTER KJE"), +0x00AD => array(0xAD, "SOFT HYPHEN"), +0x040E => array(0xAE, "CYRILLIC CAPITAL LETTER SHORT U"), +0x040F => array(0xAF, "CYRILLIC CAPITAL LETTER DZHE"), +0x0410 => array(0xB0, "CYRILLIC CAPITAL LETTER A"), +0x0411 => array(0xB1, "CYRILLIC CAPITAL LETTER BE"), +0x0412 => array(0xB2, "CYRILLIC CAPITAL LETTER VE"), +0x0413 => array(0xB3, "CYRILLIC CAPITAL LETTER GHE"), +0x0414 => array(0xB4, "CYRILLIC CAPITAL LETTER DE"), +0x0415 => array(0xB5, "CYRILLIC CAPITAL LETTER IE"), +0x0416 => array(0xB6, "CYRILLIC CAPITAL LETTER ZHE"), +0x0417 => array(0xB7, "CYRILLIC CAPITAL LETTER ZE"), +0x0418 => array(0xB8, "CYRILLIC CAPITAL LETTER I"), +0x0419 => array(0xB9, "CYRILLIC CAPITAL LETTER SHORT I"), +0x041A => array(0xBA, "CYRILLIC CAPITAL LETTER KA"), +0x041B => array(0xBB, "CYRILLIC CAPITAL LETTER EL"), +0x041C => array(0xBC, "CYRILLIC CAPITAL LETTER EM"), +0x041D => array(0xBD, "CYRILLIC CAPITAL LETTER EN"), +0x041E => array(0xBE, "CYRILLIC CAPITAL LETTER O"), +0x041F => array(0xBF, "CYRILLIC CAPITAL LETTER PE"), +0x0420 => array(0xC0, "CYRILLIC CAPITAL LETTER ER"), +0x0421 => array(0xC1, "CYRILLIC CAPITAL LETTER ES"), +0x0422 => array(0xC2, "CYRILLIC CAPITAL LETTER TE"), +0x0423 => array(0xC3, "CYRILLIC CAPITAL LETTER U"), +0x0424 => array(0xC4, "CYRILLIC CAPITAL LETTER EF"), +0x0425 => array(0xC5, "CYRILLIC CAPITAL LETTER HA"), +0x0426 => array(0xC6, "CYRILLIC CAPITAL LETTER TSE"), +0x0427 => array(0xC7, "CYRILLIC CAPITAL LETTER CHE"), +0x0428 => array(0xC8, "CYRILLIC CAPITAL LETTER SHA"), +0x0429 => array(0xC9, "CYRILLIC CAPITAL LETTER SHCHA"), +0x042A => array(0xCA, "CYRILLIC CAPITAL LETTER HARD SIGN"), +0x042B => array(0xCB, "CYRILLIC CAPITAL LETTER YERU"), +0x042C => array(0xCC, "CYRILLIC CAPITAL LETTER SOFT SIGN"), +0x042D => array(0xCD, "CYRILLIC CAPITAL LETTER E"), +0x042E => array(0xCE, "CYRILLIC 
CAPITAL LETTER YU"), +0x042F => array(0xCF, "CYRILLIC CAPITAL LETTER YA"), +0x0430 => array(0xD0, "CYRILLIC SMALL LETTER A"), +0x0431 => array(0xD1, "CYRILLIC SMALL LETTER BE"), +0x0432 => array(0xD2, "CYRILLIC SMALL LETTER VE"), +0x0433 => array(0xD3, "CYRILLIC SMALL LETTER GHE"), +0x0434 => array(0xD4, "CYRILLIC SMALL LETTER DE"), +0x0435 => array(0xD5, "CYRILLIC SMALL LETTER IE"), +0x0436 => array(0xD6, "CYRILLIC SMALL LETTER ZHE"), +0x0437 => array(0xD7, "CYRILLIC SMALL LETTER ZE"), +0x0438 => array(0xD8, "CYRILLIC SMALL LETTER I"), +0x0439 => array(0xD9, "CYRILLIC SMALL LETTER SHORT I"), +0x043A => array(0xDA, "CYRILLIC SMALL LETTER KA"), +0x043B => array(0xDB, "CYRILLIC SMALL LETTER EL"), +0x043C => array(0xDC, "CYRILLIC SMALL LETTER EM"), +0x043D => array(0xDD, "CYRILLIC SMALL LETTER EN"), +0x043E => array(0xDE, "CYRILLIC SMALL LETTER O"), +0x043F => array(0xDF, "CYRILLIC SMALL LETTER PE"), +0x0440 => array(0xE0, "CYRILLIC SMALL LETTER ER"), +0x0441 => array(0xE1, "CYRILLIC SMALL LETTER ES"), +0x0442 => array(0xE2, "CYRILLIC SMALL LETTER TE"), +0x0443 => array(0xE3, "CYRILLIC SMALL LETTER U"), +0x0444 => array(0xE4, "CYRILLIC SMALL LETTER EF"), +0x0445 => array(0xE5, "CYRILLIC SMALL LETTER HA"), +0x0446 => array(0xE6, "CYRILLIC SMALL LETTER TSE"), +0x0447 => array(0xE7, "CYRILLIC SMALL LETTER CHE"), +0x0448 => array(0xE8, "CYRILLIC SMALL LETTER SHA"), +0x0449 => array(0xE9, "CYRILLIC SMALL LETTER SHCHA"), +0x044A => array(0xEA, "CYRILLIC SMALL LETTER HARD SIGN"), +0x044B => array(0xEB, "CYRILLIC SMALL LETTER YERU"), +0x044C => array(0xEC, "CYRILLIC SMALL LETTER SOFT SIGN"), +0x044D => array(0xED, "CYRILLIC SMALL LETTER E"), +0x044E => array(0xEE, "CYRILLIC SMALL LETTER YU"), +0x044F => array(0xEF, "CYRILLIC SMALL LETTER YA"), +0x2116 => array(0xF0, "NUMERO SIGN"), +0x0451 => array(0xF1, "CYRILLIC SMALL LETTER IO"), +0x0452 => array(0xF2, "CYRILLIC SMALL LETTER DJE"), +0x0453 => array(0xF3, "CYRILLIC SMALL LETTER GJE"), +0x0454 => array(0xF4, "CYRILLIC SMALL 
LETTER UKRAINIAN IE"), +0x0455 => array(0xF5, "CYRILLIC SMALL LETTER DZE"), +0x0456 => array(0xF6, "CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I"), +0x0457 => array(0xF7, "CYRILLIC SMALL LETTER YI"), +0x0458 => array(0xF8, "CYRILLIC SMALL LETTER JE"), +0x0459 => array(0xF9, "CYRILLIC SMALL LETTER LJE"), +0x045A => array(0xFA, "CYRILLIC SMALL LETTER NJE"), +0x045B => array(0xFB, "CYRILLIC SMALL LETTER TSHE"), +0x045C => array(0xFC, "CYRILLIC SMALL LETTER KJE"), +0x00A7 => array(0xFD, "SECTION SIGN"), +0x045E => array(0xFE, "CYRILLIC SMALL LETTER SHORT U"), +0x045F => array(0xFF, "CYRILLIC SMALL LETTER DZHE"), +); + +foreach ($arr as $u => $v) { + $ent = sprintf("&#x%X;", $u); + $res = html_entity_decode($ent, ENT_QUOTES, 'ISO-8859-5'); + $d = unpack("H*", $res); + echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]); + + $ent = sprintf("&#x%X;", $v[0]); + $res = html_entity_decode($ent, ENT_QUOTES, 'ISO-8859-5'); + if ($res[0] != "&" || $res[1] != "#") + $res = unpack("H*", $res)[1]; + echo sprintf("%s => %s\n\n", $ent, $res); +} +--EXPECT-- +NO-BREAK SPACE:   => a0 +  => a0 + +CYRILLIC CAPITAL LETTER IO: Ё => a1 +¡ => ¡ + +CYRILLIC CAPITAL LETTER DJE: Ђ => a2 +¢ => ¢ + +CYRILLIC CAPITAL LETTER GJE: Ѓ => a3 +£ => £ + +CYRILLIC CAPITAL LETTER UKRAINIAN IE: Є => a4 +¤ => ¤ + +CYRILLIC CAPITAL LETTER DZE: Ѕ => a5 +¥ => ¥ + +CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I: І => a6 +¦ => ¦ + +CYRILLIC CAPITAL LETTER YI: Ї => a7 +§ => fd + +CYRILLIC CAPITAL LETTER JE: Ј => a8 +¨ => ¨ + +CYRILLIC CAPITAL LETTER LJE: Љ => a9 +© => © + +CYRILLIC CAPITAL LETTER NJE: Њ => aa +ª => ª + +CYRILLIC CAPITAL LETTER TSHE: Ћ => ab +« => « + +CYRILLIC CAPITAL LETTER KJE: Ќ => ac +¬ => ¬ + +SOFT HYPHEN: ­ => ad +­ => ad + +CYRILLIC CAPITAL LETTER SHORT U: Ў => ae +® => ® + +CYRILLIC CAPITAL LETTER DZHE: Џ => af +¯ => ¯ + +CYRILLIC CAPITAL LETTER A: А => b0 +° => ° + +CYRILLIC CAPITAL LETTER BE: Б => b1 +± => ± + +CYRILLIC CAPITAL LETTER VE: В => b2 +² => ² + +CYRILLIC CAPITAL LETTER 
GHE: Г => b3 +³ => ³ + +CYRILLIC CAPITAL LETTER DE: Д => b4 +´ => ´ + +CYRILLIC CAPITAL LETTER IE: Е => b5 +µ => µ + +CYRILLIC CAPITAL LETTER ZHE: Ж => b6 +¶ => ¶ + +CYRILLIC CAPITAL LETTER ZE: З => b7 +· => · + +CYRILLIC CAPITAL LETTER I: И => b8 +¸ => ¸ + +CYRILLIC CAPITAL LETTER SHORT I: Й => b9 +¹ => ¹ + +CYRILLIC CAPITAL LETTER KA: К => ba +º => º + +CYRILLIC CAPITAL LETTER EL: Л => bb +» => » + +CYRILLIC CAPITAL LETTER EM: М => bc +¼ => ¼ + +CYRILLIC CAPITAL LETTER EN: Н => bd +½ => ½ + +CYRILLIC CAPITAL LETTER O: О => be +¾ => ¾ + +CYRILLIC CAPITAL LETTER PE: П => bf +¿ => ¿ + +CYRILLIC CAPITAL LETTER ER: Р => c0 +À => À + +CYRILLIC CAPITAL LETTER ES: С => c1 +Á => Á + +CYRILLIC CAPITAL LETTER TE: Т => c2 + =>  + +CYRILLIC CAPITAL LETTER U: У => c3 +à => à + +CYRILLIC CAPITAL LETTER EF: Ф => c4 +Ä => Ä + +CYRILLIC CAPITAL LETTER HA: Х => c5 +Å => Å + +CYRILLIC CAPITAL LETTER TSE: Ц => c6 +Æ => Æ + +CYRILLIC CAPITAL LETTER CHE: Ч => c7 +Ç => Ç + +CYRILLIC CAPITAL LETTER SHA: Ш => c8 +È => È + +CYRILLIC CAPITAL LETTER SHCHA: Щ => c9 +É => É + +CYRILLIC CAPITAL LETTER HARD SIGN: Ъ => ca +Ê => Ê + +CYRILLIC CAPITAL LETTER YERU: Ы => cb +Ë => Ë + +CYRILLIC CAPITAL LETTER SOFT SIGN: Ь => cc +Ì => Ì + +CYRILLIC CAPITAL LETTER E: Э => cd +Í => Í + +CYRILLIC CAPITAL LETTER YU: Ю => ce +Î => Î + +CYRILLIC CAPITAL LETTER YA: Я => cf +Ï => Ï + +CYRILLIC SMALL LETTER A: а => d0 +Ð => Ð + +CYRILLIC SMALL LETTER BE: б => d1 +Ñ => Ñ + +CYRILLIC SMALL LETTER VE: в => d2 +Ò => Ò + +CYRILLIC SMALL LETTER GHE: г => d3 +Ó => Ó + +CYRILLIC SMALL LETTER DE: д => d4 +Ô => Ô + +CYRILLIC SMALL LETTER IE: е => d5 +Õ => Õ + +CYRILLIC SMALL LETTER ZHE: ж => d6 +Ö => Ö + +CYRILLIC SMALL LETTER ZE: з => d7 +× => × + +CYRILLIC SMALL LETTER I: и => d8 +Ø => Ø + +CYRILLIC SMALL LETTER SHORT I: й => d9 +Ù => Ù + +CYRILLIC SMALL LETTER KA: к => da +Ú => Ú + +CYRILLIC SMALL LETTER EL: л => db +Û => Û + +CYRILLIC SMALL LETTER EM: м => dc +Ü => Ü + +CYRILLIC SMALL LETTER EN: н => dd +Ý => Ý + 
+CYRILLIC SMALL LETTER O: о => de +Þ => Þ + +CYRILLIC SMALL LETTER PE: п => df +ß => ß + +CYRILLIC SMALL LETTER ER: р => e0 +à => à + +CYRILLIC SMALL LETTER ES: с => e1 +á => á + +CYRILLIC SMALL LETTER TE: т => e2 +â => â + +CYRILLIC SMALL LETTER U: у => e3 +ã => ã + +CYRILLIC SMALL LETTER EF: ф => e4 +ä => ä + +CYRILLIC SMALL LETTER HA: х => e5 +å => å + +CYRILLIC SMALL LETTER TSE: ц => e6 +æ => æ + +CYRILLIC SMALL LETTER CHE: ч => e7 +ç => ç + +CYRILLIC SMALL LETTER SHA: ш => e8 +è => è + +CYRILLIC SMALL LETTER SHCHA: щ => e9 +é => é + +CYRILLIC SMALL LETTER HARD SIGN: ъ => ea +ê => ê + +CYRILLIC SMALL LETTER YERU: ы => eb +ë => ë + +CYRILLIC SMALL LETTER SOFT SIGN: ь => ec +ì => ì + +CYRILLIC SMALL LETTER E: э => ed +í => í + +CYRILLIC SMALL LETTER YU: ю => ee +î => î + +CYRILLIC SMALL LETTER YA: я => ef +ï => ï + +NUMERO SIGN: № => f0 +ð => ð + +CYRILLIC SMALL LETTER IO: ё => 2623783435313b +ñ => ñ + +CYRILLIC SMALL LETTER DJE: ђ => 2623783435323b +ò => ò + +CYRILLIC SMALL LETTER GJE: ѓ => 2623783435333b +ó => ó + +CYRILLIC SMALL LETTER UKRAINIAN IE: є => 2623783435343b +ô => ô + +CYRILLIC SMALL LETTER DZE: ѕ => 2623783435353b +õ => õ + +CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I: і => 2623783435363b +ö => ö + +CYRILLIC SMALL LETTER YI: ї => 2623783435373b +÷ => ÷ + +CYRILLIC SMALL LETTER JE: ј => 2623783435383b +ø => ø + +CYRILLIC SMALL LETTER LJE: љ => 2623783435393b +ù => ù + +CYRILLIC SMALL LETTER NJE: њ => 2623783435413b +ú => ú + +CYRILLIC SMALL LETTER TSHE: ћ => 2623783435423b +û => û + +CYRILLIC SMALL LETTER KJE: ќ => 2623783435433b +ü => ü + +SECTION SIGN: § => fd +ý => ý + +CYRILLIC SMALL LETTER SHORT U: ў => 2623783435453b +þ => þ + +CYRILLIC SMALL LETTER DZHE: џ => 2623783435463b +ÿ => ÿ + + diff --git a/ext/standard/tests/strings/html_entity_decode_koi8-r.phpt b/ext/standard/tests/strings/html_entity_decode_koi8-r.phpt new file mode 100644 index 0000000000..cb7fc7d1d8 --- /dev/null +++ b/ext/standard/tests/strings/html_entity_decode_koi8-r.phpt 
@@ -0,0 +1,533 @@ +--TEST-- +Translation of HTML entities for encoding KOI8-R +--FILE-- +<?php +$arr = array( +0x2500 => array(0x80, "BOX DRAWINGS LIGHT HORIZONTAL"), +0x2502 => array(0x81, "BOX DRAWINGS LIGHT VERTICAL"), +0x250C => array(0x82, "BOX DRAWINGS LIGHT DOWN AND RIGHT"), +0x2510 => array(0x83, "BOX DRAWINGS LIGHT DOWN AND LEFT"), +0x2514 => array(0x84, "BOX DRAWINGS LIGHT UP AND RIGHT"), +0x2518 => array(0x85, "BOX DRAWINGS LIGHT UP AND LEFT"), +0x251C => array(0x86, "BOX DRAWINGS LIGHT VERTICAL AND RIGHT"), +0x2524 => array(0x87, "BOX DRAWINGS LIGHT VERTICAL AND LEFT"), +0x252C => array(0x88, "BOX DRAWINGS LIGHT DOWN AND HORIZONTAL"), +0x2534 => array(0x89, "BOX DRAWINGS LIGHT UP AND HORIZONTAL"), +0x253C => array(0x8A, "BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL"), +0x2580 => array(0x8B, "UPPER HALF BLOCK"), +0x2584 => array(0x8C, "LOWER HALF BLOCK"), +0x2588 => array(0x8D, "FULL BLOCK"), +0x258C => array(0x8E, "LEFT HALF BLOCK"), +0x2590 => array(0x8F, "RIGHT HALF BLOCK"), +0x2591 => array(0x90, "LIGHT SHADE"), +0x2592 => array(0x91, "MEDIUM SHADE"), +0x2593 => array(0x92, "DARK SHADE"), +0x2320 => array(0x93, "TOP HALF INTEGRAL"), +0x25A0 => array(0x94, "BLACK SQUARE"), +0x2219 => array(0x95, "BULLET OPERATOR"), +0x221A => array(0x96, "SQUARE ROOT"), +0x2248 => array(0x97, "ALMOST EQUAL TO"), +0x2264 => array(0x98, "LESS-THAN OR EQUAL TO"), +0x2265 => array(0x99, "GREATER-THAN OR EQUAL TO"), +0x00A0 => array(0x9A, "NO-BREAK SPACE"), +0x2321 => array(0x9B, "BOTTOM HALF INTEGRAL"), +0x00B0 => array(0x9C, "DEGREE SIGN"), +0x00B2 => array(0x9D, "SUPERSCRIPT TWO"), +0x00B7 => array(0x9E, "MIDDLE DOT"), +0x00F7 => array(0x9F, "DIVISION SIGN"), +0x2550 => array(0xA0, "BOX DRAWINGS DOUBLE HORIZONTAL"), +0x2551 => array(0xA1, "BOX DRAWINGS DOUBLE VERTICAL"), +0x2552 => array(0xA2, "BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE"), +0x0451 => array(0xA3, "CYRILLIC SMALL LETTER IO"), +0x2553 => array(0xA4, "BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE"), +0x2554 => 
array(0xA5, "BOX DRAWINGS DOUBLE DOWN AND RIGHT"), +0x2555 => array(0xA6, "BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE"), +0x2556 => array(0xA7, "BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE"), +0x2557 => array(0xA8, "BOX DRAWINGS DOUBLE DOWN AND LEFT"), +0x2558 => array(0xA9, "BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE"), +0x2559 => array(0xAA, "BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE"), +0x255A => array(0xAB, "BOX DRAWINGS DOUBLE UP AND RIGHT"), +0x255B => array(0xAC, "BOX DRAWINGS UP SINGLE AND LEFT DOUBLE"), +0x255C => array(0xAD, "BOX DRAWINGS UP DOUBLE AND LEFT SINGLE"), +0x255D => array(0xAE, "BOX DRAWINGS DOUBLE UP AND LEFT"), +0x255E => array(0xAF, "BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE"), +0x255F => array(0xB0, "BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE"), +0x2560 => array(0xB1, "BOX DRAWINGS DOUBLE VERTICAL AND RIGHT"), +0x2561 => array(0xB2, "BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE"), +0x0401 => array(0xB3, "CYRILLIC CAPITAL LETTER IO"), +0x2562 => array(0xB4, "BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE"), +0x2563 => array(0xB5, "BOX DRAWINGS DOUBLE VERTICAL AND LEFT"), +0x2564 => array(0xB6, "BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE"), +0x2565 => array(0xB7, "BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE"), +0x2566 => array(0xB8, "BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL"), +0x2567 => array(0xB9, "BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE"), +0x2568 => array(0xBA, "BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE"), +0x2569 => array(0xBB, "BOX DRAWINGS DOUBLE UP AND HORIZONTAL"), +0x256A => array(0xBC, "BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE"), +0x256B => array(0xBD, "BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE"), +0x256C => array(0xBE, "BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL"), +0x00A9 => array(0xBF, "COPYRIGHT SIGN"), +0x044E => array(0xC0, "CYRILLIC SMALL LETTER YU"), +0x0430 => array(0xC1, "CYRILLIC SMALL LETTER A"), +0x0431 => array(0xC2, "CYRILLIC SMALL LETTER BE"), +0x0446 => array(0xC3, "CYRILLIC SMALL 
LETTER TSE"), +0x0434 => array(0xC4, "CYRILLIC SMALL LETTER DE"), +0x0435 => array(0xC5, "CYRILLIC SMALL LETTER IE"), +0x0444 => array(0xC6, "CYRILLIC SMALL LETTER EF"), +0x0433 => array(0xC7, "CYRILLIC SMALL LETTER GHE"), +0x0445 => array(0xC8, "CYRILLIC SMALL LETTER HA"), +0x0438 => array(0xC9, "CYRILLIC SMALL LETTER I"), +0x0439 => array(0xCA, "CYRILLIC SMALL LETTER SHORT I"), +0x043A => array(0xCB, "CYRILLIC SMALL LETTER KA"), +0x043B => array(0xCC, "CYRILLIC SMALL LETTER EL"), +0x043C => array(0xCD, "CYRILLIC SMALL LETTER EM"), +0x043D => array(0xCE, "CYRILLIC SMALL LETTER EN"), +0x043E => array(0xCF, "CYRILLIC SMALL LETTER O"), +0x043F => array(0xD0, "CYRILLIC SMALL LETTER PE"), +0x044F => array(0xD1, "CYRILLIC SMALL LETTER YA"), +0x0440 => array(0xD2, "CYRILLIC SMALL LETTER ER"), +0x0441 => array(0xD3, "CYRILLIC SMALL LETTER ES"), +0x0442 => array(0xD4, "CYRILLIC SMALL LETTER TE"), +0x0443 => array(0xD5, "CYRILLIC SMALL LETTER U"), +0x0436 => array(0xD6, "CYRILLIC SMALL LETTER ZHE"), +0x0432 => array(0xD7, "CYRILLIC SMALL LETTER VE"), +0x044C => array(0xD8, "CYRILLIC SMALL LETTER SOFT SIGN"), +0x044B => array(0xD9, "CYRILLIC SMALL LETTER YERU"), +0x0437 => array(0xDA, "CYRILLIC SMALL LETTER ZE"), +0x0448 => array(0xDB, "CYRILLIC SMALL LETTER SHA"), +0x044D => array(0xDC, "CYRILLIC SMALL LETTER E"), +0x0449 => array(0xDD, "CYRILLIC SMALL LETTER SHCHA"), +0x0447 => array(0xDE, "CYRILLIC SMALL LETTER CHE"), +0x044A => array(0xDF, "CYRILLIC SMALL LETTER HARD SIGN"), +0x042E => array(0xE0, "CYRILLIC CAPITAL LETTER YU"), +0x0410 => array(0xE1, "CYRILLIC CAPITAL LETTER A"), +0x0411 => array(0xE2, "CYRILLIC CAPITAL LETTER BE"), +0x0426 => array(0xE3, "CYRILLIC CAPITAL LETTER TSE"), +0x0414 => array(0xE4, "CYRILLIC CAPITAL LETTER DE"), +0x0415 => array(0xE5, "CYRILLIC CAPITAL LETTER IE"), +0x0424 => array(0xE6, "CYRILLIC CAPITAL LETTER EF"), +0x0413 => array(0xE7, "CYRILLIC CAPITAL LETTER GHE"), +0x0425 => array(0xE8, "CYRILLIC CAPITAL LETTER HA"), +0x0418 => 
array(0xE9, "CYRILLIC CAPITAL LETTER I"), +0x0419 => array(0xEA, "CYRILLIC CAPITAL LETTER SHORT I"), +0x041A => array(0xEB, "CYRILLIC CAPITAL LETTER KA"), +0x041B => array(0xEC, "CYRILLIC CAPITAL LETTER EL"), +0x041C => array(0xED, "CYRILLIC CAPITAL LETTER EM"), +0x041D => array(0xEE, "CYRILLIC CAPITAL LETTER EN"), +0x041E => array(0xEF, "CYRILLIC CAPITAL LETTER O"), +0x041F => array(0xF0, "CYRILLIC CAPITAL LETTER PE"), +0x042F => array(0xF1, "CYRILLIC CAPITAL LETTER YA"), +0x0420 => array(0xF2, "CYRILLIC CAPITAL LETTER ER"), +0x0421 => array(0xF3, "CYRILLIC CAPITAL LETTER ES"), +0x0422 => array(0xF4, "CYRILLIC CAPITAL LETTER TE"), +0x0423 => array(0xF5, "CYRILLIC CAPITAL LETTER U"), +0x0416 => array(0xF6, "CYRILLIC CAPITAL LETTER ZHE"), +0x0412 => array(0xF7, "CYRILLIC CAPITAL LETTER VE"), +0x042C => array(0xF8, "CYRILLIC CAPITAL LETTER SOFT SIGN"), +0x042B => array(0xF9, "CYRILLIC CAPITAL LETTER YERU"), +0x0417 => array(0xFA, "CYRILLIC CAPITAL LETTER ZE"), +0x0428 => array(0xFB, "CYRILLIC CAPITAL LETTER SHA"), +0x042D => array(0xFC, "CYRILLIC CAPITAL LETTER E"), +0x0429 => array(0xFD, "CYRILLIC CAPITAL LETTER SHCHA"), +0x0427 => array(0xFE, "CYRILLIC CAPITAL LETTER CHE"), +0x042A => array(0xFF, "CYRILLIC CAPITAL LETTER HARD SIGN"), +); + +foreach ($arr as $u => $v) { + $ent = sprintf("&#x%X;", $u); + $res = html_entity_decode($ent, ENT_QUOTES, 'KOI8-R'); + $d = unpack("H*", $res); + echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]); + + $ent = sprintf("&#x%X;", $v[0]); + $res = html_entity_decode($ent, ENT_QUOTES, 'KOI8-R'); + if ($res[0] != "&" || $res[1] != "#") + $res = unpack("H*", $res)[1]; + echo sprintf("%s => %s\n\n", $ent, $res); +} +--EXPECT-- +BOX DRAWINGS LIGHT HORIZONTAL: ─ => 80 +€ => € + +BOX DRAWINGS LIGHT VERTICAL: │ => 81 + =>  + +BOX DRAWINGS LIGHT DOWN AND RIGHT: ┌ => 82 +‚ => ‚ + +BOX DRAWINGS LIGHT DOWN AND LEFT: ┐ => 83 +ƒ => ƒ + +BOX DRAWINGS LIGHT UP AND RIGHT: └ => 84 +„ => „ + +BOX DRAWINGS LIGHT UP AND LEFT: ┘ => 85 +… => … + +BOX 
DRAWINGS LIGHT VERTICAL AND RIGHT: ├ => 86 +† => † + +BOX DRAWINGS LIGHT VERTICAL AND LEFT: ┤ => 87 +‡ => ‡ + +BOX DRAWINGS LIGHT DOWN AND HORIZONTAL: ┬ => 88 +ˆ => ˆ + +BOX DRAWINGS LIGHT UP AND HORIZONTAL: ┴ => 89 +‰ => ‰ + +BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL: ┼ => 8a +Š => Š + +UPPER HALF BLOCK: ▀ => 8b +‹ => ‹ + +LOWER HALF BLOCK: ▄ => 8c +Œ => Œ + +FULL BLOCK: █ => 8d + =>  + +LEFT HALF BLOCK: ▌ => 8e +Ž => Ž + +RIGHT HALF BLOCK: ▐ => 8f + =>  + +LIGHT SHADE: ░ => 90 + =>  + +MEDIUM SHADE: ▒ => 91 +‘ => ‘ + +DARK SHADE: ▓ => 92 +’ => ’ + +TOP HALF INTEGRAL: ⌠ => 93 +“ => “ + +BLACK SQUARE: ■ => 94 +” => ” + +BULLET OPERATOR: ∙ => 95 +• => • + +SQUARE ROOT: √ => 96 +– => – + +ALMOST EQUAL TO: ≈ => 97 +— => — + +LESS-THAN OR EQUAL TO: ≤ => 98 +˜ => ˜ + +GREATER-THAN OR EQUAL TO: ≥ => 99 +™ => ™ + +NO-BREAK SPACE:   => 9a +š => š + +BOTTOM HALF INTEGRAL: ⌡ => 9b +› => › + +DEGREE SIGN: ° => 9c +œ => œ + +SUPERSCRIPT TWO: ² => 9d + =>  + +MIDDLE DOT: · => 9e +ž => ž + +DIVISION SIGN: ÷ => 9f +Ÿ => Ÿ + +BOX DRAWINGS DOUBLE HORIZONTAL: ═ => a0 +  => 9a + +BOX DRAWINGS DOUBLE VERTICAL: ║ => a1 +¡ => ¡ + +BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE: ╒ => a2 +¢ => ¢ + +CYRILLIC SMALL LETTER IO: ё => a3 +£ => £ + +BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE: ╓ => a4 +¤ => ¤ + +BOX DRAWINGS DOUBLE DOWN AND RIGHT: ╔ => a5 +¥ => ¥ + +BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE: ╕ => a6 +¦ => ¦ + +BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE: ╖ => a7 +§ => § + +BOX DRAWINGS DOUBLE DOWN AND LEFT: ╗ => a8 +¨ => ¨ + +BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE: ╘ => a9 +© => bf + +BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE: ╙ => aa +ª => ª + +BOX DRAWINGS DOUBLE UP AND RIGHT: ╚ => ab +« => « + +BOX DRAWINGS UP SINGLE AND LEFT DOUBLE: ╛ => ac +¬ => ¬ + +BOX DRAWINGS UP DOUBLE AND LEFT SINGLE: ╜ => ad +­ => ­ + +BOX DRAWINGS DOUBLE UP AND LEFT: ╝ => ae +® => ® + +BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE: ╞ => af +¯ => ¯ + +BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE: ╟ => 
b0 +° => 9c + +BOX DRAWINGS DOUBLE VERTICAL AND RIGHT: ╠ => b1 +± => ± + +BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE: ╡ => b2 +² => 9d + +CYRILLIC CAPITAL LETTER IO: Ё => b3 +³ => ³ + +BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE: ╢ => b4 +´ => ´ + +BOX DRAWINGS DOUBLE VERTICAL AND LEFT: ╣ => b5 +µ => µ + +BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE: ╤ => b6 +¶ => ¶ + +BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE: ╥ => b7 +· => 9e + +BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL: ╦ => b8 +¸ => ¸ + +BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE: ╧ => b9 +¹ => ¹ + +BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE: ╨ => ba +º => º + +BOX DRAWINGS DOUBLE UP AND HORIZONTAL: ╩ => bb +» => » + +BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE: ╪ => bc +¼ => ¼ + +BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE: ╫ => bd +½ => ½ + +BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL: ╬ => be +¾ => ¾ + +COPYRIGHT SIGN: © => bf +¿ => ¿ + +CYRILLIC SMALL LETTER YU: ю => c0 +À => À + +CYRILLIC SMALL LETTER A: а => c1 +Á => Á + +CYRILLIC SMALL LETTER BE: б => c2 + =>  + +CYRILLIC SMALL LETTER TSE: ц => c3 +à => à + +CYRILLIC SMALL LETTER DE: д => c4 +Ä => Ä + +CYRILLIC SMALL LETTER IE: е => c5 +Å => Å + +CYRILLIC SMALL LETTER EF: ф => c6 +Æ => Æ + +CYRILLIC SMALL LETTER GHE: г => c7 +Ç => Ç + +CYRILLIC SMALL LETTER HA: х => c8 +È => È + +CYRILLIC SMALL LETTER I: и => c9 +É => É + +CYRILLIC SMALL LETTER SHORT I: й => ca +Ê => Ê + +CYRILLIC SMALL LETTER KA: к => cb +Ë => Ë + +CYRILLIC SMALL LETTER EL: л => cc +Ì => Ì + +CYRILLIC SMALL LETTER EM: м => cd +Í => Í + +CYRILLIC SMALL LETTER EN: н => ce +Î => Î + +CYRILLIC SMALL LETTER O: о => cf +Ï => Ï + +CYRILLIC SMALL LETTER PE: п => d0 +Ð => Ð + +CYRILLIC SMALL LETTER YA: я => d1 +Ñ => Ñ + +CYRILLIC SMALL LETTER ER: р => d2 +Ò => Ò + +CYRILLIC SMALL LETTER ES: с => d3 +Ó => Ó + +CYRILLIC SMALL LETTER TE: т => d4 +Ô => Ô + +CYRILLIC SMALL LETTER U: у => d5 +Õ => Õ + +CYRILLIC SMALL LETTER ZHE: ж => d6 +Ö => Ö + +CYRILLIC SMALL LETTER VE: в 
=> d7 +× => × + +CYRILLIC SMALL LETTER SOFT SIGN: ь => d8 +Ø => Ø + +CYRILLIC SMALL LETTER YERU: ы => d9 +Ù => Ù + +CYRILLIC SMALL LETTER ZE: з => da +Ú => Ú + +CYRILLIC SMALL LETTER SHA: ш => db +Û => Û + +CYRILLIC SMALL LETTER E: э => dc +Ü => Ü + +CYRILLIC SMALL LETTER SHCHA: щ => dd +Ý => Ý + +CYRILLIC SMALL LETTER CHE: ч => de +Þ => Þ + +CYRILLIC SMALL LETTER HARD SIGN: ъ => df +ß => ß + +CYRILLIC CAPITAL LETTER YU: Ю => e0 +à => à + +CYRILLIC CAPITAL LETTER A: А => e1 +á => á + +CYRILLIC CAPITAL LETTER BE: Б => e2 +â => â + +CYRILLIC CAPITAL LETTER TSE: Ц => e3 +ã => ã + +CYRILLIC CAPITAL LETTER DE: Д => e4 +ä => ä + +CYRILLIC CAPITAL LETTER IE: Е => e5 +å => å + +CYRILLIC CAPITAL LETTER EF: Ф => e6 +æ => æ + +CYRILLIC CAPITAL LETTER GHE: Г => e7 +ç => ç + +CYRILLIC CAPITAL LETTER HA: Х => e8 +è => è + +CYRILLIC CAPITAL LETTER I: И => e9 +é => é + +CYRILLIC CAPITAL LETTER SHORT I: Й => ea +ê => ê + +CYRILLIC CAPITAL LETTER KA: К => eb +ë => ë + +CYRILLIC CAPITAL LETTER EL: Л => ec +ì => ì + +CYRILLIC CAPITAL LETTER EM: М => ed +í => í + +CYRILLIC CAPITAL LETTER EN: Н => ee +î => î + +CYRILLIC CAPITAL LETTER O: О => ef +ï => ï + +CYRILLIC CAPITAL LETTER PE: П => f0 +ð => ð + +CYRILLIC CAPITAL LETTER YA: Я => f1 +ñ => ñ + +CYRILLIC CAPITAL LETTER ER: Р => f2 +ò => ò + +CYRILLIC CAPITAL LETTER ES: С => f3 +ó => ó + +CYRILLIC CAPITAL LETTER TE: Т => f4 +ô => ô + +CYRILLIC CAPITAL LETTER U: У => f5 +õ => õ + +CYRILLIC CAPITAL LETTER ZHE: Ж => f6 +ö => ö + +CYRILLIC CAPITAL LETTER VE: В => f7 +÷ => 9f + +CYRILLIC CAPITAL LETTER SOFT SIGN: Ь => f8 +ø => ø + +CYRILLIC CAPITAL LETTER YERU: Ы => f9 +ù => ù + +CYRILLIC CAPITAL LETTER ZE: З => fa +ú => ú + +CYRILLIC CAPITAL LETTER SHA: Ш => fb +û => û + +CYRILLIC CAPITAL LETTER E: Э => fc +ü => ü + +CYRILLIC CAPITAL LETTER SHCHA: Щ => fd +ý => ý + +CYRILLIC CAPITAL LETTER CHE: Ч => fe +þ => þ + +CYRILLIC CAPITAL LETTER HARD SIGN: Ъ => ff +ÿ => ÿ + + diff --git a/ext/standard/tests/strings/html_entity_decode_macroman.phpt 
b/ext/standard/tests/strings/html_entity_decode_macroman.phpt new file mode 100644 index 0000000000..4691bcf1a7 --- /dev/null +++ b/ext/standard/tests/strings/html_entity_decode_macroman.phpt @@ -0,0 +1,540 @@ +--TEST-- +Translation of HTML entities for encoding MacRoman +--FILE-- +<?php +$arr = array( +0x00C4 => array(0x80, "LATIN CAPITAL LETTER A WITH DIAERESIS"), +0x00C5 => array(0x81, "LATIN CAPITAL LETTER A WITH RING ABOVE"), +0x00C7 => array(0x82, "LATIN CAPITAL LETTER C WITH CEDILLA"), +0x00C9 => array(0x83, "LATIN CAPITAL LETTER E WITH ACUTE"), +0x00D1 => array(0x84, "LATIN CAPITAL LETTER N WITH TILDE"), +0x00D6 => array(0x85, "LATIN CAPITAL LETTER O WITH DIAERESIS"), +0x00DC => array(0x86, "LATIN CAPITAL LETTER U WITH DIAERESIS"), +0x00E1 => array(0x87, "LATIN SMALL LETTER A WITH ACUTE"), +0x00E0 => array(0x88, "LATIN SMALL LETTER A WITH GRAVE"), +0x00E2 => array(0x89, "LATIN SMALL LETTER A WITH CIRCUMFLEX"), +0x00E4 => array(0x8A, "LATIN SMALL LETTER A WITH DIAERESIS"), +0x00E3 => array(0x8B, "LATIN SMALL LETTER A WITH TILDE"), +0x00E5 => array(0x8C, "LATIN SMALL LETTER A WITH RING ABOVE"), +0x00E7 => array(0x8D, "LATIN SMALL LETTER C WITH CEDILLA"), +0x00E9 => array(0x8E, "LATIN SMALL LETTER E WITH ACUTE"), +0x00E8 => array(0x8F, "LATIN SMALL LETTER E WITH GRAVE"), +0x00EA => array(0x90, "LATIN SMALL LETTER E WITH CIRCUMFLEX"), +0x00EB => array(0x91, "LATIN SMALL LETTER E WITH DIAERESIS"), +0x00ED => array(0x92, "LATIN SMALL LETTER I WITH ACUTE"), +0x00EC => array(0x93, "LATIN SMALL LETTER I WITH GRAVE"), +0x00EE => array(0x94, "LATIN SMALL LETTER I WITH CIRCUMFLEX"), +0x00EF => array(0x95, "LATIN SMALL LETTER I WITH DIAERESIS"), +0x00F1 => array(0x96, "LATIN SMALL LETTER N WITH TILDE"), +0x00F3 => array(0x97, "LATIN SMALL LETTER O WITH ACUTE"), +0x00F2 => array(0x98, "LATIN SMALL LETTER O WITH GRAVE"), +0x00F4 => array(0x99, "LATIN SMALL LETTER O WITH CIRCUMFLEX"), +0x00F6 => array(0x9A, "LATIN SMALL LETTER O WITH DIAERESIS"), +0x00F5 => array(0x9B, 
"LATIN SMALL LETTER O WITH TILDE"), +0x00FA => array(0x9C, "LATIN SMALL LETTER U WITH ACUTE"), +0x00F9 => array(0x9D, "LATIN SMALL LETTER U WITH GRAVE"), +0x00FB => array(0x9E, "LATIN SMALL LETTER U WITH CIRCUMFLEX"), +0x00FC => array(0x9F, "LATIN SMALL LETTER U WITH DIAERESIS"), +0x2020 => array(0xA0, "DAGGER"), +0x00B0 => array(0xA1, "DEGREE SIGN"), +0x00A2 => array(0xA2, "CENT SIGN"), +0x00A3 => array(0xA3, "POUND SIGN"), +0x00A7 => array(0xA4, "SECTION SIGN"), +0x2022 => array(0xA5, "BULLET"), +0x00B6 => array(0xA6, "PILCROW SIGN"), +0x00DF => array(0xA7, "LATIN SMALL LETTER SHARP S"), +0x00AE => array(0xA8, "REGISTERED SIGN"), +0x00A9 => array(0xA9, "COPYRIGHT SIGN"), +0x2122 => array(0xAA, "TRADE MARK SIGN"), +0x00B4 => array(0xAB, "ACUTE ACCENT"), +0x00A8 => array(0xAC, "DIAERESIS"), +0x2260 => array(0xAD, "NOT EQUAL TO"), +0x00C6 => array(0xAE, "LATIN CAPITAL LETTER AE"), +0x00D8 => array(0xAF, "LATIN CAPITAL LETTER O WITH STROKE"), +0x221E => array(0xB0, "INFINITY"), +0x00B1 => array(0xB1, "PLUS-MINUS SIGN"), +0x2264 => array(0xB2, "LESS-THAN OR EQUAL TO"), +0x2265 => array(0xB3, "GREATER-THAN OR EQUAL TO"), +0x00A5 => array(0xB4, "YEN SIGN"), +0x00B5 => array(0xB5, "MICRO SIGN"), +0x2202 => array(0xB6, "PARTIAL DIFFERENTIAL"), +0x2211 => array(0xB7, "N-ARY SUMMATION"), +0x220F => array(0xB8, "N-ARY PRODUCT"), +0x03C0 => array(0xB9, "GREEK SMALL LETTER PI"), +0x222B => array(0xBA, "INTEGRAL"), +0x00AA => array(0xBB, "FEMININE ORDINAL INDICATOR"), +0x00BA => array(0xBC, "MASCULINE ORDINAL INDICATOR"), +0x03A9 => array(0xBD, "GREEK CAPITAL LETTER OMEGA"), +0x00E6 => array(0xBE, "LATIN SMALL LETTER AE"), +0x00F8 => array(0xBF, "LATIN SMALL LETTER O WITH STROKE"), +0x00BF => array(0xC0, "INVERTED QUESTION MARK"), +0x00A1 => array(0xC1, "INVERTED EXCLAMATION MARK"), +0x00AC => array(0xC2, "NOT SIGN"), +0x221A => array(0xC3, "SQUARE ROOT"), +0x0192 => array(0xC4, "LATIN SMALL LETTER F WITH HOOK"), +0x2248 => array(0xC5, "ALMOST EQUAL TO"), +0x2206 => array(0xC6, 
"INCREMENT"), +0x00AB => array(0xC7, "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK"), +0x00BB => array(0xC8, "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK"), +0x2026 => array(0xC9, "HORIZONTAL ELLIPSIS"), +0x00A0 => array(0xCA, "NO-BREAK SPACE"), +0x00C0 => array(0xCB, "LATIN CAPITAL LETTER A WITH GRAVE"), +0x00C3 => array(0xCC, "LATIN CAPITAL LETTER A WITH TILDE"), +0x00D5 => array(0xCD, "LATIN CAPITAL LETTER O WITH TILDE"), +0x0152 => array(0xCE, "LATIN CAPITAL LIGATURE OE"), +0x0153 => array(0xCF, "LATIN SMALL LIGATURE OE"), +0x2013 => array(0xD0, "EN DASH"), +0x2014 => array(0xD1, "EM DASH"), +0x201C => array(0xD2, "LEFT DOUBLE QUOTATION MARK"), +0x201D => array(0xD3, "RIGHT DOUBLE QUOTATION MARK"), +0x2018 => array(0xD4, "LEFT SINGLE QUOTATION MARK"), +0x2019 => array(0xD5, "RIGHT SINGLE QUOTATION MARK"), +0x00F7 => array(0xD6, "DIVISION SIGN"), +0x25CA => array(0xD7, "LOZENGE"), +0x00FF => array(0xD8, "LATIN SMALL LETTER Y WITH DIAERESIS"), +0x0178 => array(0xD9, "LATIN CAPITAL LETTER Y WITH DIAERESIS"), +0x2044 => array(0xDA, "FRACTION SLASH"), +0x20AC => array(0xDB, "EURO SIGN"), +0x2039 => array(0xDC, "SINGLE LEFT-POINTING ANGLE QUOTATION MARK"), +0x203A => array(0xDD, "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK"), +0xFB01 => array(0xDE, "LATIN SMALL LIGATURE FI"), +0xFB02 => array(0xDF, "LATIN SMALL LIGATURE FL"), +0x2021 => array(0xE0, "DOUBLE DAGGER"), +0x00B7 => array(0xE1, "MIDDLE DOT"), +0x201A => array(0xE2, "SINGLE LOW-9 QUOTATION MARK"), +0x201E => array(0xE3, "DOUBLE LOW-9 QUOTATION MARK"), +0x2030 => array(0xE4, "PER MILLE SIGN"), +0x00C2 => array(0xE5, "LATIN CAPITAL LETTER A WITH CIRCUMFLEX"), +0x00CA => array(0xE6, "LATIN CAPITAL LETTER E WITH CIRCUMFLEX"), +0x00C1 => array(0xE7, "LATIN CAPITAL LETTER A WITH ACUTE"), +0x00CB => array(0xE8, "LATIN CAPITAL LETTER E WITH DIAERESIS"), +0x00C8 => array(0xE9, "LATIN CAPITAL LETTER E WITH GRAVE"), +0x00CD => array(0xEA, "LATIN CAPITAL LETTER I WITH ACUTE"), +0x00CE => array(0xEB, "LATIN CAPITAL LETTER I 
WITH CIRCUMFLEX"), +0x00CF => array(0xEC, "LATIN CAPITAL LETTER I WITH DIAERESIS"), +0x00CC => array(0xED, "LATIN CAPITAL LETTER I WITH GRAVE"), +0x00D3 => array(0xEE, "LATIN CAPITAL LETTER O WITH ACUTE"), +0x00D4 => array(0xEF, "LATIN CAPITAL LETTER O WITH CIRCUMFLEX"), +0xF8FF => array(0xF0, "Apple logo"), +0x00D2 => array(0xF1, "LATIN CAPITAL LETTER O WITH GRAVE"), +0x00DA => array(0xF2, "LATIN CAPITAL LETTER U WITH ACUTE"), +0x00DB => array(0xF3, "LATIN CAPITAL LETTER U WITH CIRCUMFLEX"), +0x00D9 => array(0xF4, "LATIN CAPITAL LETTER U WITH GRAVE"), +0x0131 => array(0xF5, "LATIN SMALL LETTER DOTLESS I"), +0x02C6 => array(0xF6, "MODIFIER LETTER CIRCUMFLEX ACCENT"), +0x02DC => array(0xF7, "SMALL TILDE"), +0x00AF => array(0xF8, "MACRON"), +0x02D8 => array(0xF9, "BREVE"), +0x02D9 => array(0xFA, "DOT ABOVE"), +0x02DA => array(0xFB, "RING ABOVE"), +0x00B8 => array(0xFC, "CEDILLA"), +0x02DD => array(0xFD, "DOUBLE ACUTE ACCENT"), +0x02DB => array(0xFE, "OGONEK"), +0x02C7 => array(0xFF, "CARON"), +); + +$res = html_entity_decode("", ENT_QUOTES, 'MacRoman'); +echo "Special test for  (shouldn't decode):\n"; +echo $res,"\n\n"; + +foreach ($arr as $u => $v) { + $ent = sprintf("&#x%X;", $u); + $res = html_entity_decode($ent, ENT_QUOTES, 'MacRoman'); + $d = unpack("H*", $res); + echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]); + + $ent = sprintf("&#x%X;", $v[0]); + $res = html_entity_decode($ent, ENT_QUOTES, 'MacRoman'); + if ($res[0] != "&" || $res[1] != "#") + $res = unpack("H*", $res)[1]; + echo sprintf("%s => %s\n\n", $ent, $res); +} +--EXPECT-- +Special test for  (shouldn't decode): + + +LATIN CAPITAL LETTER A WITH DIAERESIS: Ä => 80 +€ => € + +LATIN CAPITAL LETTER A WITH RING ABOVE: Å => 81 + =>  + +LATIN CAPITAL LETTER C WITH CEDILLA: Ç => 82 +‚ => ‚ + +LATIN CAPITAL LETTER E WITH ACUTE: É => 83 +ƒ => ƒ + +LATIN CAPITAL LETTER N WITH TILDE: Ñ => 84 +„ => „ + +LATIN CAPITAL LETTER O WITH DIAERESIS: Ö => 85 +… => … + +LATIN CAPITAL LETTER U WITH DIAERESIS: Ü => 86 +† 
=> † + +LATIN SMALL LETTER A WITH ACUTE: á => 87 +‡ => ‡ + +LATIN SMALL LETTER A WITH GRAVE: à => 88 +ˆ => ˆ + +LATIN SMALL LETTER A WITH CIRCUMFLEX: â => 89 +‰ => ‰ + +LATIN SMALL LETTER A WITH DIAERESIS: ä => 8a +Š => Š + +LATIN SMALL LETTER A WITH TILDE: ã => 8b +‹ => ‹ + +LATIN SMALL LETTER A WITH RING ABOVE: å => 8c +Œ => Œ + +LATIN SMALL LETTER C WITH CEDILLA: ç => 8d + =>  + +LATIN SMALL LETTER E WITH ACUTE: é => 8e +Ž => Ž + +LATIN SMALL LETTER E WITH GRAVE: è => 8f + =>  + +LATIN SMALL LETTER E WITH CIRCUMFLEX: ê => 90 + =>  + +LATIN SMALL LETTER E WITH DIAERESIS: ë => 91 +‘ => ‘ + +LATIN SMALL LETTER I WITH ACUTE: í => 92 +’ => ’ + +LATIN SMALL LETTER I WITH GRAVE: ì => 93 +“ => “ + +LATIN SMALL LETTER I WITH CIRCUMFLEX: î => 94 +” => ” + +LATIN SMALL LETTER I WITH DIAERESIS: ï => 95 +• => • + +LATIN SMALL LETTER N WITH TILDE: ñ => 96 +– => – + +LATIN SMALL LETTER O WITH ACUTE: ó => 97 +— => — + +LATIN SMALL LETTER O WITH GRAVE: ò => 98 +˜ => ˜ + +LATIN SMALL LETTER O WITH CIRCUMFLEX: ô => 99 +™ => ™ + +LATIN SMALL LETTER O WITH DIAERESIS: ö => 9a +š => š + +LATIN SMALL LETTER O WITH TILDE: õ => 9b +› => › + +LATIN SMALL LETTER U WITH ACUTE: ú => 9c +œ => œ + +LATIN SMALL LETTER U WITH GRAVE: ù => 9d + =>  + +LATIN SMALL LETTER U WITH CIRCUMFLEX: û => 9e +ž => ž + +LATIN SMALL LETTER U WITH DIAERESIS: ü => 9f +Ÿ => Ÿ + +DAGGER: † => a0 +  => ca + +DEGREE SIGN: ° => a1 +¡ => c1 + +CENT SIGN: ¢ => a2 +¢ => a2 + +POUND SIGN: £ => a3 +£ => a3 + +SECTION SIGN: § => a4 +¤ => ¤ + +BULLET: • => a5 +¥ => b4 + +PILCROW SIGN: ¶ => a6 +¦ => ¦ + +LATIN SMALL LETTER SHARP S: ß => a7 +§ => a4 + +REGISTERED SIGN: ® => a8 +¨ => ac + +COPYRIGHT SIGN: © => a9 +© => a9 + +TRADE MARK SIGN: ™ => aa +ª => bb + +ACUTE ACCENT: ´ => ab +« => c7 + +DIAERESIS: ¨ => ac +¬ => c2 + +NOT EQUAL TO: ≠ => ad +­ => ­ + +LATIN CAPITAL LETTER AE: Æ => ae +® => a8 + +LATIN CAPITAL LETTER O WITH STROKE: Ø => af +¯ => f8 + +INFINITY: ∞ => b0 +° => a1 + +PLUS-MINUS SIGN: ± => b1 +± => b1 
+ +LESS-THAN OR EQUAL TO: ≤ => b2 +² => ² + +GREATER-THAN OR EQUAL TO: ≥ => b3 +³ => ³ + +YEN SIGN: ¥ => b4 +´ => ab + +MICRO SIGN: µ => b5 +µ => b5 + +PARTIAL DIFFERENTIAL: ∂ => b6 +¶ => a6 + +N-ARY SUMMATION: ∑ => b7 +· => e1 + +N-ARY PRODUCT: ∏ => b8 +¸ => fc + +GREEK SMALL LETTER PI: π => b9 +¹ => ¹ + +INTEGRAL: ∫ => ba +º => bc + +FEMININE ORDINAL INDICATOR: ª => bb +» => c8 + +MASCULINE ORDINAL INDICATOR: º => bc +¼ => ¼ + +GREEK CAPITAL LETTER OMEGA: Ω => bd +½ => ½ + +LATIN SMALL LETTER AE: æ => be +¾ => ¾ + +LATIN SMALL LETTER O WITH STROKE: ø => bf +¿ => c0 + +INVERTED QUESTION MARK: ¿ => c0 +À => cb + +INVERTED EXCLAMATION MARK: ¡ => c1 +Á => e7 + +NOT SIGN: ¬ => c2 + => e5 + +SQUARE ROOT: √ => c3 +à => cc + +LATIN SMALL LETTER F WITH HOOK: ƒ => c4 +Ä => 80 + +ALMOST EQUAL TO: ≈ => c5 +Å => 81 + +INCREMENT: ∆ => c6 +Æ => ae + +LEFT-POINTING DOUBLE ANGLE QUOTATION MARK: « => c7 +Ç => 82 + +RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK: » => c8 +È => e9 + +HORIZONTAL ELLIPSIS: … => c9 +É => 83 + +NO-BREAK SPACE:   => ca +Ê => e6 + +LATIN CAPITAL LETTER A WITH GRAVE: À => cb +Ë => e8 + +LATIN CAPITAL LETTER A WITH TILDE: à => cc +Ì => ed + +LATIN CAPITAL LETTER O WITH TILDE: Õ => cd +Í => ea + +LATIN CAPITAL LIGATURE OE: Œ => ce +Î => eb + +LATIN SMALL LIGATURE OE: œ => cf +Ï => ec + +EN DASH: – => d0 +Ð => Ð + +EM DASH: — => d1 +Ñ => 84 + +LEFT DOUBLE QUOTATION MARK: “ => d2 +Ò => f1 + +RIGHT DOUBLE QUOTATION MARK: ” => d3 +Ó => ee + +LEFT SINGLE QUOTATION MARK: ‘ => d4 +Ô => ef + +RIGHT SINGLE QUOTATION MARK: ’ => d5 +Õ => cd + +DIVISION SIGN: ÷ => d6 +Ö => 85 + +LOZENGE: ◊ => d7 +× => × + +LATIN SMALL LETTER Y WITH DIAERESIS: ÿ => d8 +Ø => af + +LATIN CAPITAL LETTER Y WITH DIAERESIS: Ÿ => d9 +Ù => f4 + +FRACTION SLASH: ⁄ => da +Ú => f2 + +EURO SIGN: € => db +Û => f3 + +SINGLE LEFT-POINTING ANGLE QUOTATION MARK: ‹ => dc +Ü => 86 + +SINGLE RIGHT-POINTING ANGLE QUOTATION MARK: › => dd +Ý => Ý + +LATIN SMALL LIGATURE FI: fi => de +Þ => Þ + +LATIN SMALL LIGATURE 
FL: fl => df +ß => a7 + +DOUBLE DAGGER: ‡ => e0 +à => 88 + +MIDDLE DOT: · => e1 +á => 87 + +SINGLE LOW-9 QUOTATION MARK: ‚ => e2 +â => 89 + +DOUBLE LOW-9 QUOTATION MARK: „ => e3 +ã => 8b + +PER MILLE SIGN: ‰ => e4 +ä => 8a + +LATIN CAPITAL LETTER A WITH CIRCUMFLEX:  => e5 +å => 8c + +LATIN CAPITAL LETTER E WITH CIRCUMFLEX: Ê => e6 +æ => be + +LATIN CAPITAL LETTER A WITH ACUTE: Á => e7 +ç => 8d + +LATIN CAPITAL LETTER E WITH DIAERESIS: Ë => e8 +è => 8f + +LATIN CAPITAL LETTER E WITH GRAVE: È => e9 +é => 8e + +LATIN CAPITAL LETTER I WITH ACUTE: Í => ea +ê => 90 + +LATIN CAPITAL LETTER I WITH CIRCUMFLEX: Î => eb +ë => 91 + +LATIN CAPITAL LETTER I WITH DIAERESIS: Ï => ec +ì => 93 + +LATIN CAPITAL LETTER I WITH GRAVE: Ì => ed +í => 92 + +LATIN CAPITAL LETTER O WITH ACUTE: Ó => ee +î => 94 + +LATIN CAPITAL LETTER O WITH CIRCUMFLEX: Ô => ef +ï => 95 + +Apple logo:  => f0 +ð => ð + +LATIN CAPITAL LETTER O WITH GRAVE: Ò => f1 +ñ => 96 + +LATIN CAPITAL LETTER U WITH ACUTE: Ú => f2 +ò => 98 + +LATIN CAPITAL LETTER U WITH CIRCUMFLEX: Û => f3 +ó => 97 + +LATIN CAPITAL LETTER U WITH GRAVE: Ù => f4 +ô => 99 + +LATIN SMALL LETTER DOTLESS I: ı => f5 +õ => 9b + +MODIFIER LETTER CIRCUMFLEX ACCENT: ˆ => f6 +ö => 9a + +SMALL TILDE: ˜ => f7 +÷ => d6 + +MACRON: ¯ => f8 +ø => bf + +BREVE: ˘ => f9 +ù => 9d + +DOT ABOVE: ˙ => fa +ú => 9c + +RING ABOVE: ˚ => fb +û => 9e + +CEDILLA: ¸ => fc +ü => 9f + +DOUBLE ACUTE ACCENT: ˝ => fd +ý => ý + +OGONEK: ˛ => fe +þ => þ + +CARON: ˇ => ff +ÿ => d8 + + diff --git a/ext/standard/tests/strings/html_entity_decode_win1251.phpt b/ext/standard/tests/strings/html_entity_decode_win1251.phpt new file mode 100644 index 0000000000..e47392623c --- /dev/null +++ b/ext/standard/tests/strings/html_entity_decode_win1251.phpt @@ -0,0 +1,537 @@ +--TEST-- +Translation of HTML entities for encoding WIN-1251 +--FILE-- +<?php +$arr = array( +0x0402 => array(0x80, "CYRILLIC CAPITAL LETTER DJE"), +0x0403 => array(0x81, "CYRILLIC CAPITAL LETTER GJE"), +0x201A => 
array(0x82, "SINGLE LOW-9 QUOTATION MARK"), +0x0453 => array(0x83, "CYRILLIC SMALL LETTER GJE"), +0x201E => array(0x84, "DOUBLE LOW-9 QUOTATION MARK"), +0x2026 => array(0x85, "HORIZONTAL ELLIPSIS"), +0x2020 => array(0x86, "DAGGER"), +0x2021 => array(0x87, "DOUBLE DAGGER"), +0x20AC => array(0x88, "EURO SIGN"), +0x2030 => array(0x89, "PER MILLE SIGN"), +0x0409 => array(0x8A, "CYRILLIC CAPITAL LETTER LJE"), +0x2039 => array(0x8B, "SINGLE LEFT-POINTING ANGLE QUOTATION MARK"), +0x040A => array(0x8C, "CYRILLIC CAPITAL LETTER NJE"), +0x040C => array(0x8D, "CYRILLIC CAPITAL LETTER KJE"), +0x040B => array(0x8E, "CYRILLIC CAPITAL LETTER TSHE"), +0x040F => array(0x8F, "CYRILLIC CAPITAL LETTER DZHE"), +0x0452 => array(0x90, "CYRILLIC SMALL LETTER DJE"), +0x2018 => array(0x91, "LEFT SINGLE QUOTATION MARK"), +0x2019 => array(0x92, "RIGHT SINGLE QUOTATION MARK"), +0x201C => array(0x93, "LEFT DOUBLE QUOTATION MARK"), +0x201D => array(0x94, "RIGHT DOUBLE QUOTATION MARK"), +0x2022 => array(0x95, "BULLET"), +0x2013 => array(0x96, "EN DASH"), +0x2014 => array(0x97, "EM DASH"), +//0x98 #UNDEFINED +0x2122 => array(0x99, "TRADE MARK SIGN"), +0x0459 => array(0x9A, "CYRILLIC SMALL LETTER LJE"), +0x203A => array(0x9B, "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK"), +0x045A => array(0x9C, "CYRILLIC SMALL LETTER NJE"), +0x045C => array(0x9D, "CYRILLIC SMALL LETTER KJE"), +0x045B => array(0x9E, "CYRILLIC SMALL LETTER TSHE"), +0x045F => array(0x9F, "CYRILLIC SMALL LETTER DZHE"), +0x00A0 => array(0xA0, "NO-BREAK SPACE"), +0x040E => array(0xA1, "CYRILLIC CAPITAL LETTER SHORT U"), +0x045E => array(0xA2, "CYRILLIC SMALL LETTER SHORT U"), +0x0408 => array(0xA3, "CYRILLIC CAPITAL LETTER JE"), +0x00A4 => array(0xA4, "CURRENCY SIGN"), +0x0490 => array(0xA5, "CYRILLIC CAPITAL LETTER GHE WITH UPTURN"), +0x00A6 => array(0xA6, "BROKEN BAR"), +0x00A7 => array(0xA7, "SECTION SIGN"), +0x0401 => array(0xA8, "CYRILLIC CAPITAL LETTER IO"), +0x00A9 => array(0xA9, "COPYRIGHT SIGN"), +0x0404 => array(0xAA, "CYRILLIC 
CAPITAL LETTER UKRAINIAN IE"), +0x00AB => array(0xAB, "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK"), +0x00AC => array(0xAC, "NOT SIGN"), +0x00AD => array(0xAD, "SOFT HYPHEN"), +0x00AE => array(0xAE, "REGISTERED SIGN"), +0x0407 => array(0xAF, "CYRILLIC CAPITAL LETTER YI"), +0x00B0 => array(0xB0, "DEGREE SIGN"), +0x00B1 => array(0xB1, "PLUS-MINUS SIGN"), +0x0406 => array(0xB2, "CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I"), +0x0456 => array(0xB3, "CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I"), +0x0491 => array(0xB4, "CYRILLIC SMALL LETTER GHE WITH UPTURN"), +0x00B5 => array(0xB5, "MICRO SIGN"), +0x00B6 => array(0xB6, "PILCROW SIGN"), +0x00B7 => array(0xB7, "MIDDLE DOT"), +0x0451 => array(0xB8, "CYRILLIC SMALL LETTER IO"), +0x2116 => array(0xB9, "NUMERO SIGN"), +0x0454 => array(0xBA, "CYRILLIC SMALL LETTER UKRAINIAN IE"), +0x00BB => array(0xBB, "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK"), +0x0458 => array(0xBC, "CYRILLIC SMALL LETTER JE"), +0x0405 => array(0xBD, "CYRILLIC CAPITAL LETTER DZE"), +0x0455 => array(0xBE, "CYRILLIC SMALL LETTER DZE"), +0x0457 => array(0xBF, "CYRILLIC SMALL LETTER YI"), +0x0410 => array(0xC0, "CYRILLIC CAPITAL LETTER A"), +0x0411 => array(0xC1, "CYRILLIC CAPITAL LETTER BE"), +0x0412 => array(0xC2, "CYRILLIC CAPITAL LETTER VE"), +0x0413 => array(0xC3, "CYRILLIC CAPITAL LETTER GHE"), +0x0414 => array(0xC4, "CYRILLIC CAPITAL LETTER DE"), +0x0415 => array(0xC5, "CYRILLIC CAPITAL LETTER IE"), +0x0416 => array(0xC6, "CYRILLIC CAPITAL LETTER ZHE"), +0x0417 => array(0xC7, "CYRILLIC CAPITAL LETTER ZE"), +0x0418 => array(0xC8, "CYRILLIC CAPITAL LETTER I"), +0x0419 => array(0xC9, "CYRILLIC CAPITAL LETTER SHORT I"), +0x041A => array(0xCA, "CYRILLIC CAPITAL LETTER KA"), +0x041B => array(0xCB, "CYRILLIC CAPITAL LETTER EL"), +0x041C => array(0xCC, "CYRILLIC CAPITAL LETTER EM"), +0x041D => array(0xCD, "CYRILLIC CAPITAL LETTER EN"), +0x041E => array(0xCE, "CYRILLIC CAPITAL LETTER O"), +0x041F => array(0xCF, "CYRILLIC CAPITAL LETTER PE"), +0x0420 => 
array(0xD0, "CYRILLIC CAPITAL LETTER ER"), +0x0421 => array(0xD1, "CYRILLIC CAPITAL LETTER ES"), +0x0422 => array(0xD2, "CYRILLIC CAPITAL LETTER TE"), +0x0423 => array(0xD3, "CYRILLIC CAPITAL LETTER U"), +0x0424 => array(0xD4, "CYRILLIC CAPITAL LETTER EF"), +0x0425 => array(0xD5, "CYRILLIC CAPITAL LETTER HA"), +0x0426 => array(0xD6, "CYRILLIC CAPITAL LETTER TSE"), +0x0427 => array(0xD7, "CYRILLIC CAPITAL LETTER CHE"), +0x0428 => array(0xD8, "CYRILLIC CAPITAL LETTER SHA"), +0x0429 => array(0xD9, "CYRILLIC CAPITAL LETTER SHCHA"), +0x042A => array(0xDA, "CYRILLIC CAPITAL LETTER HARD SIGN"), +0x042B => array(0xDB, "CYRILLIC CAPITAL LETTER YERU"), +0x042C => array(0xDC, "CYRILLIC CAPITAL LETTER SOFT SIGN"), +0x042D => array(0xDD, "CYRILLIC CAPITAL LETTER E"), +0x042E => array(0xDE, "CYRILLIC CAPITAL LETTER YU"), +0x042F => array(0xDF, "CYRILLIC CAPITAL LETTER YA"), +0x0430 => array(0xE0, "CYRILLIC SMALL LETTER A"), +0x0431 => array(0xE1, "CYRILLIC SMALL LETTER BE"), +0x0432 => array(0xE2, "CYRILLIC SMALL LETTER VE"), +0x0433 => array(0xE3, "CYRILLIC SMALL LETTER GHE"), +0x0434 => array(0xE4, "CYRILLIC SMALL LETTER DE"), +0x0435 => array(0xE5, "CYRILLIC SMALL LETTER IE"), +0x0436 => array(0xE6, "CYRILLIC SMALL LETTER ZHE"), +0x0437 => array(0xE7, "CYRILLIC SMALL LETTER ZE"), +0x0438 => array(0xE8, "CYRILLIC SMALL LETTER I"), +0x0439 => array(0xE9, "CYRILLIC SMALL LETTER SHORT I"), +0x043A => array(0xEA, "CYRILLIC SMALL LETTER KA"), +0x043B => array(0xEB, "CYRILLIC SMALL LETTER EL"), +0x043C => array(0xEC, "CYRILLIC SMALL LETTER EM"), +0x043D => array(0xED, "CYRILLIC SMALL LETTER EN"), +0x043E => array(0xEE, "CYRILLIC SMALL LETTER O"), +0x043F => array(0xEF, "CYRILLIC SMALL LETTER PE"), +0x0440 => array(0xF0, "CYRILLIC SMALL LETTER ER"), +0x0441 => array(0xF1, "CYRILLIC SMALL LETTER ES"), +0x0442 => array(0xF2, "CYRILLIC SMALL LETTER TE"), +0x0443 => array(0xF3, "CYRILLIC SMALL LETTER U"), +0x0444 => array(0xF4, "CYRILLIC SMALL LETTER EF"), +0x0445 => array(0xF5, 
"CYRILLIC SMALL LETTER HA"), +0x0446 => array(0xF6, "CYRILLIC SMALL LETTER TSE"), +0x0447 => array(0xF7, "CYRILLIC SMALL LETTER CHE"), +0x0448 => array(0xF8, "CYRILLIC SMALL LETTER SHA"), +0x0449 => array(0xF9, "CYRILLIC SMALL LETTER SHCHA"), +0x044A => array(0xFA, "CYRILLIC SMALL LETTER HARD SIGN"), +0x044B => array(0xFB, "CYRILLIC SMALL LETTER YERU"), +0x044C => array(0xFC, "CYRILLIC SMALL LETTER SOFT SIGN"), +0x044D => array(0xFD, "CYRILLIC SMALL LETTER E"), +0x044E => array(0xFE, "CYRILLIC SMALL LETTER YU"), +0x044F => array(0xFF, "CYRILLIC SMALL LETTER YA"), +); + +$res = html_entity_decode("˜", ENT_QUOTES, 'WINDOWS-1251'); +echo "Special test for ˜ (shouldn't decode):\n"; +echo $res,"\n\n"; + +foreach ($arr as $u => $v) { + $ent = sprintf("&#x%X;", $u); + $res = html_entity_decode($ent, ENT_QUOTES, 'WINDOWS-1251'); + $d = unpack("H*", $res); + echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]); + + $ent = sprintf("&#x%X;", $v[0]); + $res = html_entity_decode($ent, ENT_QUOTES, 'WINDOWS-1251'); + if ($res[0] != "&" || $res[1] != "#") + $res = unpack("H*", $res)[1]; + echo sprintf("%s => %s\n\n", $ent, $res); +} +--EXPECT-- +Special test for ˜ (shouldn't decode): +˜ + +CYRILLIC CAPITAL LETTER DJE: Ђ => 80 +€ => € + +CYRILLIC CAPITAL LETTER GJE: Ѓ => 81 + =>  + +SINGLE LOW-9 QUOTATION MARK: ‚ => 82 +‚ => ‚ + +CYRILLIC SMALL LETTER GJE: ѓ => 83 +ƒ => ƒ + +DOUBLE LOW-9 QUOTATION MARK: „ => 84 +„ => „ + +HORIZONTAL ELLIPSIS: … => 85 +… => … + +DAGGER: † => 86 +† => † + +DOUBLE DAGGER: ‡ => 87 +‡ => ‡ + +EURO SIGN: € => 88 +ˆ => ˆ + +PER MILLE SIGN: ‰ => 89 +‰ => ‰ + +CYRILLIC CAPITAL LETTER LJE: Љ => 8a +Š => Š + +SINGLE LEFT-POINTING ANGLE QUOTATION MARK: ‹ => 8b +‹ => ‹ + +CYRILLIC CAPITAL LETTER NJE: Њ => 8c +Œ => Œ + +CYRILLIC CAPITAL LETTER KJE: Ќ => 8d + =>  + +CYRILLIC CAPITAL LETTER TSHE: Ћ => 8e +Ž => Ž + +CYRILLIC CAPITAL LETTER DZHE: Џ => 8f + =>  + +CYRILLIC SMALL LETTER DJE: ђ => 90 + =>  + +LEFT SINGLE QUOTATION MARK: ‘ => 91 +‘ => ‘ + +RIGHT 
SINGLE QUOTATION MARK: ’ => 92 +’ => ’ + +LEFT DOUBLE QUOTATION MARK: “ => 93 +“ => “ + +RIGHT DOUBLE QUOTATION MARK: ” => 94 +” => ” + +BULLET: • => 95 +• => • + +EN DASH: – => 96 +– => – + +EM DASH: — => 97 +— => — + +TRADE MARK SIGN: ™ => 99 +™ => ™ + +CYRILLIC SMALL LETTER LJE: љ => 9a +š => š + +SINGLE RIGHT-POINTING ANGLE QUOTATION MARK: › => 9b +› => › + +CYRILLIC SMALL LETTER NJE: њ => 9c +œ => œ + +CYRILLIC SMALL LETTER KJE: ќ => 9d + =>  + +CYRILLIC SMALL LETTER TSHE: ћ => 9e +ž => ž + +CYRILLIC SMALL LETTER DZHE: џ => 9f +Ÿ => Ÿ + +NO-BREAK SPACE:   => a0 +  => a0 + +CYRILLIC CAPITAL LETTER SHORT U: Ў => a1 +¡ => ¡ + +CYRILLIC SMALL LETTER SHORT U: ў => a2 +¢ => ¢ + +CYRILLIC CAPITAL LETTER JE: Ј => a3 +£ => £ + +CURRENCY SIGN: ¤ => a4 +¤ => a4 + +CYRILLIC CAPITAL LETTER GHE WITH UPTURN: Ґ => a5 +¥ => ¥ + +BROKEN BAR: ¦ => a6 +¦ => a6 + +SECTION SIGN: § => a7 +§ => a7 + +CYRILLIC CAPITAL LETTER IO: Ё => a8 +¨ => ¨ + +COPYRIGHT SIGN: © => a9 +© => a9 + +CYRILLIC CAPITAL LETTER UKRAINIAN IE: Є => aa +ª => ª + +LEFT-POINTING DOUBLE ANGLE QUOTATION MARK: « => ab +« => ab + +NOT SIGN: ¬ => ac +¬ => ac + +SOFT HYPHEN: ­ => ad +­ => ad + +REGISTERED SIGN: ® => ae +® => ae + +CYRILLIC CAPITAL LETTER YI: Ї => af +¯ => ¯ + +DEGREE SIGN: ° => b0 +° => b0 + +PLUS-MINUS SIGN: ± => b1 +± => b1 + +CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I: І => b2 +² => ² + +CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I: і => b3 +³ => ³ + +CYRILLIC SMALL LETTER GHE WITH UPTURN: ґ => b4 +´ => ´ + +MICRO SIGN: µ => b5 +µ => b5 + +PILCROW SIGN: ¶ => b6 +¶ => b6 + +MIDDLE DOT: · => b7 +· => b7 + +CYRILLIC SMALL LETTER IO: ё => b8 +¸ => ¸ + +NUMERO SIGN: № => b9 +¹ => ¹ + +CYRILLIC SMALL LETTER UKRAINIAN IE: є => ba +º => º + +RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK: » => bb +» => bb + +CYRILLIC SMALL LETTER JE: ј => bc +¼ => ¼ + +CYRILLIC CAPITAL LETTER DZE: Ѕ => bd +½ => ½ + +CYRILLIC SMALL LETTER DZE: ѕ => be +¾ => ¾ + +CYRILLIC SMALL LETTER YI: ї => bf +¿ => ¿ + +CYRILLIC 
CAPITAL LETTER A: А => c0 +À => À + +CYRILLIC CAPITAL LETTER BE: Б => c1 +Á => Á + +CYRILLIC CAPITAL LETTER VE: В => c2 + =>  + +CYRILLIC CAPITAL LETTER GHE: Г => c3 +à => à + +CYRILLIC CAPITAL LETTER DE: Д => c4 +Ä => Ä + +CYRILLIC CAPITAL LETTER IE: Е => c5 +Å => Å + +CYRILLIC CAPITAL LETTER ZHE: Ж => c6 +Æ => Æ + +CYRILLIC CAPITAL LETTER ZE: З => c7 +Ç => Ç + +CYRILLIC CAPITAL LETTER I: И => c8 +È => È + +CYRILLIC CAPITAL LETTER SHORT I: Й => c9 +É => É + +CYRILLIC CAPITAL LETTER KA: К => ca +Ê => Ê + +CYRILLIC CAPITAL LETTER EL: Л => cb +Ë => Ë + +CYRILLIC CAPITAL LETTER EM: М => cc +Ì => Ì + +CYRILLIC CAPITAL LETTER EN: Н => cd +Í => Í + +CYRILLIC CAPITAL LETTER O: О => ce +Î => Î + +CYRILLIC CAPITAL LETTER PE: П => cf +Ï => Ï + +CYRILLIC CAPITAL LETTER ER: Р => d0 +Ð => Ð + +CYRILLIC CAPITAL LETTER ES: С => d1 +Ñ => Ñ + +CYRILLIC CAPITAL LETTER TE: Т => d2 +Ò => Ò + +CYRILLIC CAPITAL LETTER U: У => d3 +Ó => Ó + +CYRILLIC CAPITAL LETTER EF: Ф => d4 +Ô => Ô + +CYRILLIC CAPITAL LETTER HA: Х => d5 +Õ => Õ + +CYRILLIC CAPITAL LETTER TSE: Ц => d6 +Ö => Ö + +CYRILLIC CAPITAL LETTER CHE: Ч => d7 +× => × + +CYRILLIC CAPITAL LETTER SHA: Ш => d8 +Ø => Ø + +CYRILLIC CAPITAL LETTER SHCHA: Щ => d9 +Ù => Ù + +CYRILLIC CAPITAL LETTER HARD SIGN: Ъ => da +Ú => Ú + +CYRILLIC CAPITAL LETTER YERU: Ы => db +Û => Û + +CYRILLIC CAPITAL LETTER SOFT SIGN: Ь => dc +Ü => Ü + +CYRILLIC CAPITAL LETTER E: Э => dd +Ý => Ý + +CYRILLIC CAPITAL LETTER YU: Ю => de +Þ => Þ + +CYRILLIC CAPITAL LETTER YA: Я => df +ß => ß + +CYRILLIC SMALL LETTER A: а => e0 +à => à + +CYRILLIC SMALL LETTER BE: б => e1 +á => á + +CYRILLIC SMALL LETTER VE: в => e2 +â => â + +CYRILLIC SMALL LETTER GHE: г => e3 +ã => ã + +CYRILLIC SMALL LETTER DE: д => e4 +ä => ä + +CYRILLIC SMALL LETTER IE: е => e5 +å => å + +CYRILLIC SMALL LETTER ZHE: ж => e6 +æ => æ + +CYRILLIC SMALL LETTER ZE: з => e7 +ç => ç + +CYRILLIC SMALL LETTER I: и => e8 +è => è + +CYRILLIC SMALL LETTER SHORT I: й => e9 +é => é + +CYRILLIC SMALL LETTER KA: к 
=> ea +ê => ê + +CYRILLIC SMALL LETTER EL: л => eb +ë => ë + +CYRILLIC SMALL LETTER EM: м => ec +ì => ì + +CYRILLIC SMALL LETTER EN: н => ed +í => í + +CYRILLIC SMALL LETTER O: о => ee +î => î + +CYRILLIC SMALL LETTER PE: п => ef +ï => ï + +CYRILLIC SMALL LETTER ER: р => f0 +ð => ð + +CYRILLIC SMALL LETTER ES: с => f1 +ñ => ñ + +CYRILLIC SMALL LETTER TE: т => f2 +ò => ò + +CYRILLIC SMALL LETTER U: у => f3 +ó => ó + +CYRILLIC SMALL LETTER EF: ф => f4 +ô => ô + +CYRILLIC SMALL LETTER HA: х => f5 +õ => õ + +CYRILLIC SMALL LETTER TSE: ц => f6 +ö => ö + +CYRILLIC SMALL LETTER CHE: ч => f7 +÷ => ÷ + +CYRILLIC SMALL LETTER SHA: ш => f8 +ø => ø + +CYRILLIC SMALL LETTER SHCHA: щ => f9 +ù => ù + +CYRILLIC SMALL LETTER HARD SIGN: ъ => fa +ú => ú + +CYRILLIC SMALL LETTER YERU: ы => fb +û => û + +CYRILLIC SMALL LETTER SOFT SIGN: ь => fc +ü => ü + +CYRILLIC SMALL LETTER E: э => fd +ý => ý + +CYRILLIC SMALL LETTER YU: ю => fe +þ => þ + +CYRILLIC SMALL LETTER YA: я => ff +ÿ => ÿ + + diff --git a/ext/standard/tests/strings/html_entity_decode_win1252.phpt b/ext/standard/tests/strings/html_entity_decode_win1252.phpt new file mode 100644 index 0000000000..2a7a6981dc --- /dev/null +++ b/ext/standard/tests/strings/html_entity_decode_win1252.phpt @@ -0,0 +1,169 @@ +--TEST-- +Translation of HTML entities for encoding WIN-1252 +--FILE-- +<?php +$arr = array( +0x20AC => array(0x80, "EURO SIGN"), +//0x81 #UNDEFINED +0x201A => array(0x82, "SINGLE LOW-9 QUOTATION MARK"), +0x0192 => array(0x83, "LATIN SMALL LETTER F WITH HOOK"), +0x201E => array(0x84, "DOUBLE LOW-9 QUOTATION MARK"), +0x2026 => array(0x85, "HORIZONTAL ELLIPSIS"), +0x2020 => array(0x86, "DAGGER"), +0x2021 => array(0x87, "DOUBLE DAGGER"), +0x02C6 => array(0x88, "MODIFIER LETTER CIRCUMFLEX ACCENT"), +0x2030 => array(0x89, "PER MILLE SIGN"), +0x0160 => array(0x8A, "LATIN CAPITAL LETTER S WITH CARON"), +0x2039 => array(0x8B, "SINGLE LEFT-POINTING ANGLE QUOTATION MARK"), +0x0152 => array(0x8C, "LATIN CAPITAL LIGATURE OE"), +//0x8D 
#UNDEFINED +0x017D => array(0x8E, "LATIN CAPITAL LETTER Z WITH CARON"), +//0x8F #UNDEFINED +//0x90 #UNDEFINED +0x2018 => array(0x91, "LEFT SINGLE QUOTATION MARK"), +0x2019 => array(0x92, "RIGHT SINGLE QUOTATION MARK"), +0x201C => array(0x93, "LEFT DOUBLE QUOTATION MARK"), +0x201D => array(0x94, "RIGHT DOUBLE QUOTATION MARK"), +0x2022 => array(0x95, "BULLET"), +0x2013 => array(0x96, "EN DASH"), +0x2014 => array(0x97, "EM DASH"), +0x02DC => array(0x98, "SMALL TILDE"), +0x2122 => array(0x99, "TRADE MARK SIGN"), +0x0161 => array(0x9A, "LATIN SMALL LETTER S WITH CARON"), +0x203A => array(0x9B, "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK"), +0x0153 => array(0x9C, "LATIN SMALL LIGATURE OE"), +//0x9D #UNDEFINED +0x017E => array(0x9E, "LATIN SMALL LETTER Z WITH CARON"), +0x0178 => array(0x9F, "LATIN CAPITAL LETTER Y WITH DIAERESIS"), +); + +$res = html_entity_decode("", ENT_QUOTES, 'WINDOWS-1252'); +echo "Special test for  (shouldn't decode):\n"; +echo $res,"\n\n"; + +$res = html_entity_decode("", ENT_QUOTES, 'WINDOWS-1252'); +echo "Special test for  (shouldn't decode):\n"; +echo $res,"\n\n"; + +$res = html_entity_decode("", ENT_QUOTES, 'WINDOWS-1252'); +echo "Special test for  (shouldn't decode):\n"; +echo $res,"\n\n"; + +$res = html_entity_decode("", ENT_QUOTES, 'WINDOWS-1252'); +echo "Special test for  (shouldn't decode):\n"; +echo $res,"\n\n"; + +$res = html_entity_decode("", ENT_QUOTES, 'WINDOWS-1252'); +echo "Special test for  (shouldn't decode):\n"; +echo $res,"\n\n"; + +foreach ($arr as $u => $v) { + $ent = sprintf("&#x%X;", $u); + $res = html_entity_decode($ent, ENT_QUOTES, 'WINDOWS-1252'); + $d = unpack("H*", $res); + echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]); + + $ent = sprintf("&#x%X;", $v[0]); + $res = html_entity_decode($ent, ENT_QUOTES, 'WINDOWS-1252'); + if ($res[0] != "&" || $res[1] != "#") + $res = unpack("H*", $res)[1]; + echo sprintf("%s => %s\n\n", $ent, $res); +} +--EXPECT-- +Special test for  (shouldn't decode): + + +Special test 
for  (shouldn't decode): + + +Special test for  (shouldn't decode): + + +Special test for  (shouldn't decode): + + +Special test for  (shouldn't decode): + + +EURO SIGN: € => 80 +€ => € + +SINGLE LOW-9 QUOTATION MARK: ‚ => 82 +‚ => ‚ + +LATIN SMALL LETTER F WITH HOOK: ƒ => 83 +ƒ => ƒ + +DOUBLE LOW-9 QUOTATION MARK: „ => 84 +„ => „ + +HORIZONTAL ELLIPSIS: … => 85 +… => … + +DAGGER: † => 86 +† => † + +DOUBLE DAGGER: ‡ => 87 +‡ => ‡ + +MODIFIER LETTER CIRCUMFLEX ACCENT: ˆ => 88 +ˆ => ˆ + +PER MILLE SIGN: ‰ => 89 +‰ => ‰ + +LATIN CAPITAL LETTER S WITH CARON: Š => 8a +Š => Š + +SINGLE LEFT-POINTING ANGLE QUOTATION MARK: ‹ => 8b +‹ => ‹ + +LATIN CAPITAL LIGATURE OE: Œ => 8c +Œ => Œ + +LATIN CAPITAL LETTER Z WITH CARON: Ž => 8e +Ž => Ž + +LEFT SINGLE QUOTATION MARK: ‘ => 91 +‘ => ‘ + +RIGHT SINGLE QUOTATION MARK: ’ => 92 +’ => ’ + +LEFT DOUBLE QUOTATION MARK: “ => 93 +“ => “ + +RIGHT DOUBLE QUOTATION MARK: ” => 94 +” => ” + +BULLET: • => 95 +• => • + +EN DASH: – => 96 +– => – + +EM DASH: — => 97 +— => — + +SMALL TILDE: ˜ => 98 +˜ => ˜ + +TRADE MARK SIGN: ™ => 99 +™ => ™ + +LATIN SMALL LETTER S WITH CARON: š => 9a +š => š + +SINGLE RIGHT-POINTING ANGLE QUOTATION MARK: › => 9b +› => › + +LATIN SMALL LIGATURE OE: œ => 9c +œ => œ + +LATIN SMALL LETTER Z WITH CARON: ž => 9e +ž => ž + +LATIN CAPITAL LETTER Y WITH DIAERESIS: Ÿ => 9f +Ÿ => Ÿ + + diff --git a/ext/standard/tests/strings/htmlentities17.phpt b/ext/standard/tests/strings/htmlentities17.phpt index b203e7c3e0..d9e67a9b87 100644 --- a/ext/standard/tests/strings/htmlentities17.phpt +++ b/ext/standard/tests/strings/htmlentities17.phpt @@ -3,7 +3,6 @@ htmlentities() / html_entity_decode() #8592 - #9002 table test --FILE-- <?php $tests = array( - array(8768, '≀', "e28980"), array(8853, '⊕', "e28a95"), array(8855, '⊗', "e28a97"), array(8869, '⊥', "e28aa5"), @@ -26,7 +25,6 @@ foreach ($tests as $test) { } ?> --EXPECT-- -string(8) "≀" string(7) "⊕" string(8) "⊗" string(6) "⊥" @@ -37,7 +35,6 @@ string(8) "⌊" string(8) 
"⌋" string(6) "⟨" string(6) "⟩" -string(6) "e28980" string(6) "e28a95" string(6) "e28a97" string(6) "e28aa5" |