summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- ext/standard/basic_functions.c | 5
-rw-r--r-- ext/standard/basic_functions.h | 4
-rw-r--r-- ext/standard/html.c | 1133
-rw-r--r-- ext/standard/html_tables.h | 2080
-rw-r--r-- ext/standard/tests/strings/get_html_translation_table_basic1.phpt | 8
-rw-r--r-- ext/standard/tests/strings/html_entity_decode_cp866.phpt | 533
-rw-r--r-- ext/standard/tests/strings/html_entity_decode_iso8859-15.phpt | 405
-rw-r--r-- ext/standard/tests/strings/html_entity_decode_iso8859-5.phpt | 405
-rw-r--r-- ext/standard/tests/strings/html_entity_decode_koi8-r.phpt | 533
-rw-r--r-- ext/standard/tests/strings/html_entity_decode_macroman.phpt | 540
-rw-r--r-- ext/standard/tests/strings/html_entity_decode_win1251.phpt | 537
-rw-r--r-- ext/standard/tests/strings/html_entity_decode_win1252.phpt | 169
-rw-r--r-- ext/standard/tests/strings/htmlentities17.phpt | 3
13 files changed, 5690 insertions, 665 deletions
diff --git a/ext/standard/basic_functions.c b/ext/standard/basic_functions.c
index 96201955be..a70a5b222b 100644
--- a/ext/standard/basic_functions.c
+++ b/ext/standard/basic_functions.c
@@ -3432,6 +3432,7 @@ static void basic_globals_ctor(php_basic_globals *basic_globals_p TSRMLS_DC) /*
BG(left) = -1;
BG(user_tick_functions) = NULL;
BG(user_filter_map) = NULL;
+ BG(inverse_ent_maps) = NULL;
memset(&BG(serialize), 0, sizeof(BG(serialize)));
memset(&BG(unserialize), 0, sizeof(BG(unserialize)));
@@ -3454,6 +3455,10 @@ static void basic_globals_dtor(php_basic_globals *basic_globals_p TSRMLS_DC) /*
zend_hash_destroy(BG(url_adapt_state_ex).tags);
free(BG(url_adapt_state_ex).tags);
}
+ if (BG(inverse_ent_maps)) {
+ zend_hash_destroy(BG(inverse_ent_maps));
+ pefree(BG(inverse_ent_maps), 1);
+ }
}
/* }}} */
diff --git a/ext/standard/basic_functions.h b/ext/standard/basic_functions.h
index 4498e6cf8f..edc5846e0a 100644
--- a/ext/standard/basic_functions.h
+++ b/ext/standard/basic_functions.h
@@ -220,6 +220,10 @@ typedef struct _php_basic_globals {
HashTable *user_filter_map;
+ /* html.c */
+ /* map entities to characters. Stores hash table pointers for each charset */
+ HashTable *inverse_ent_maps;
+
/* file.c */
#if defined(_REENTRANT) && defined(HAVE_MBRLEN) && defined(HAVE_MBSTATE_T)
mbstate_t mblen_state;
diff --git a/ext/standard/html.c b/ext/standard/html.c
index 7a14f6b0ad..0ad34e52c4 100644
--- a/ext/standard/html.c
+++ b/ext/standard/html.c
@@ -14,7 +14,8 @@
+----------------------------------------------------------------------+
| Authors: Rasmus Lerdorf <rasmus@php.net> |
| Jaakko Hyvätti <jaakko.hyvatti@iki.fi> |
- | Wez Furlong <wez@thebrainroom.com> |
+ | Wez Furlong <wez@thebrainroom.com> |
+ | Gustavo Lopes <cataphract@php.net> |
+----------------------------------------------------------------------+
*/
@@ -28,7 +29,11 @@
* http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
*
* http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
- *
+ *
+ * From HTML 4.01 strict DTD:
+ * http://www.w3.org/TR/html4/HTMLlat1.ent
+ * http://www.w3.org/TR/html4/HTMLsymbol.ent
+ * http://www.w3.org/TR/html4/HTMLspecial.ent
*/
#include "php.h"
@@ -37,7 +42,7 @@
#else
#include <php_config.h>
#endif
-#include "html.h"
+#include "php_standard.h"
#include "php_string.h"
#include "SAPI.h"
#if HAVE_LOCALE_H
@@ -52,424 +57,8 @@
ZEND_EXTERN_MODULE_GLOBALS(mbstring)
#endif
-enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252,
- cs_8859_15, cs_utf_8, cs_big5, cs_gb2312,
- cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r,
- cs_cp1251, cs_8859_5, cs_cp866, cs_macroman
- };
-typedef const char *const entity_table_t;
-
-/* codepage 1252 is a Windows extension to iso-8859-1. */
-static entity_table_t ent_cp_1252[] = {
- "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger",
- "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
- NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo",
- "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
- "oelig", NULL, NULL, "Yuml"
-};
-
-static entity_table_t ent_iso_8859_1[] = {
- "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
- "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
- "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
- "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
- "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
- "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
- "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
- "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
- "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
- "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
- "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
- "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
- "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
- "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
- "uuml", "yacute", "thorn", "yuml"
-};
-
-static entity_table_t ent_iso_8859_15[] = {
- "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
- "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
- "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */
- "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm",
- "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
- "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
- "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
- "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
- "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
- "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
- "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
- "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
- "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
- "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
- "uuml", "yacute", "thorn", "yuml"
-};
-
-static entity_table_t ent_uni_338_402[] = {
- /* 338 (0x0152) */
- "OElig", "oelig", NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 352 (0x0160) */
- "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 376 (0x0178) */
- "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 400 (0x0190) */
- NULL, NULL, "fnof"
-};
-
-static entity_table_t ent_uni_spacing[] = {
- /* 710 */
- "circ",
- /* 711 - 730 */
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 731 - 732 */
- NULL, "tilde"
-};
-
-static entity_table_t ent_uni_greek[] = {
- /* 913 */
- "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
- "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
- NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
- /* 938 - 944 are not mapped */
- NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
- "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
- "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
- /* 970 - 976 are not mapped */
- NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- "thetasym", "upsih",
- NULL, NULL, NULL,
- "piv"
-};
-
-static entity_table_t ent_uni_punct[] = {
- /* 8194 */
- "ensp", "emsp", NULL, NULL, NULL, NULL, NULL,
- "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
- NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
- /* 8216 */
- "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
- "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip",
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL,
- /* 8242 */
- "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
- NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
- "frasl"
-};
-
-static entity_table_t ent_uni_euro[] = {
- "euro"
-};
-
-static entity_table_t ent_uni_8465_8501[] = {
- /* 8465 */
- "image", NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8472 */
- "weierp", NULL, NULL, NULL,
- /* 8476 */
- "real", NULL, NULL, NULL, NULL, NULL,
- /* 8482 */
- "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8501 */
- "alefsym",
-};
-
-static entity_table_t ent_uni_8592_9002[] = {
- /* 8592 (0x2190) */
- "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8608 (0x21a0) */
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8624 (0x21b0) */
- NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8640 (0x21c0) */
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8656 (0x21d0) */
- "lArr", "uArr", "rArr", "dArr", "hArr", NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8672 (0x21e0) */
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8704 (0x2200) */
- "forall", NULL, "part", "exist", NULL, "empty", NULL, "nabla",
- "isin", "notin", NULL, "ni", NULL, NULL, NULL, "prod",
- /* 8720 (0x2210) */
- NULL, "sum", "minus", NULL, NULL, NULL, NULL, "lowast",
- NULL, NULL, "radic", NULL, NULL, "prop", "infin", NULL,
- /* 8736 (0x2220) */
- "ang", NULL, NULL, NULL, NULL, NULL, NULL, "and",
- "or", "cap", "cup", "int", NULL, NULL, NULL, NULL,
- /* 8752 (0x2230) */
- NULL, NULL, NULL, NULL, "there4", NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, "sim", NULL, NULL, NULL,
- /* 8768 (0x2240) */
- NULL, NULL, NULL, NULL, NULL, "cong", NULL, NULL,
- "asymp", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8784 (0x2250) */
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8800 (0x2260) */
- "ne", "equiv", NULL, NULL, "le", "ge", NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8816 (0x2270) */
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8832 (0x2280) */
- NULL, NULL, "sub", "sup", "nsub", NULL, "sube", "supe",
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8848 (0x2290) */
- NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8864 (0x22a0) */
- NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8880 (0x22b0) */
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8896 (0x22c0) */
- NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8912 (0x22d0) */
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8928 (0x22e0) */
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8944 (0x22f0) */
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8960 (0x2300) */
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
- /* 8976 (0x2310) */
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- /* 8992 (0x2320) */
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, "lang", "rang"
-};
-
-static entity_table_t ent_uni_9674[] = {
- /* 9674 */
- "loz"
-};
-
-static entity_table_t ent_uni_9824_9830[] = {
- /* 9824 */
- "spades", NULL, NULL, "clubs", NULL, "hearts", "diams"
-};
-
-static entity_table_t ent_koi8r[] = {
- "#1105", /* "jo "*/
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092",
- "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084",
- "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090",
- "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096",
- "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041",
- "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048",
- "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055",
- "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
- "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063",
- "#1066"
-};
-
-static entity_table_t ent_cp_1251[] = {
- "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
- "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
- "#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220",
- "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250",
- "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
- "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
- "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
- "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
- "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
- "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046",
- "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
- "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
- "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
- "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
- "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
- "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
- "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
- "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
- "#1103"
-};
-
-static entity_table_t ent_iso_8859_5[] = {
- "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
- "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
- "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
- "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
- "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
- "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
- "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104",
- "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
- "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118",
- "#1119"
-};
-
-static entity_table_t ent_cp_866[] = {
-
- "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566",
- "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552",
- "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560",
- "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608",
- "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090",
- "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
- "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025",
- "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118",
- "#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632",
- "#160"
-};
-
-/* MacRoman has a couple of low-ascii chars that need mapping too */
-/* Vertical tab (ASCII 11) is often used to store line breaks inside */
-/* DB exports, this mapping changes it to a space */
-static entity_table_t ent_macroman[] = {
- "sp", NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, "quot", NULL,
- NULL, NULL, "amp", NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, "lt", NULL, "gt", NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml",
- "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
- "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
- "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
- "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg",
- "cent", "pound", "sect", "bull", "para", "szlig", "reg",
- "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
- "infin", "plusmn", "le", "ge", "yen", "micro", "part",
- "sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
- "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof",
- "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
- "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
- "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
- "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger",
- "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
- "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
- "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
- "circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
- "#733", "#731", "#711"
-};
-
-struct html_entity_map {
- enum entity_charset charset; /* charset identifier */
- unsigned int basechar; /* char code at start of table */
- unsigned int endchar; /* last char code in the table */
- entity_table_t *table; /* the table of mappings */
-};
-
-static const struct html_entity_map entity_map[] = {
- { cs_cp1252, 0x80, 0x9f, ent_cp_1252 },
- { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 },
- { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 },
- { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 },
- { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 },
- { cs_utf_8, 338, 402, ent_uni_338_402 },
- { cs_utf_8, 710, 732, ent_uni_spacing },
- { cs_utf_8, 913, 982, ent_uni_greek },
- { cs_utf_8, 8194, 8260, ent_uni_punct },
- { cs_utf_8, 8364, 8364, ent_uni_euro },
- { cs_utf_8, 8465, 8501, ent_uni_8465_8501 },
- { cs_utf_8, 8592, 9002, ent_uni_8592_9002 },
- { cs_utf_8, 9674, 9674, ent_uni_9674 },
- { cs_utf_8, 9824, 9830, ent_uni_9824_9830 },
- { cs_big5, 0xa0, 0xff, ent_iso_8859_1 },
- { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 },
- { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
- { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 },
- { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 },
- { cs_koi8r, 0xa3, 0xff, ent_koi8r },
- { cs_cp1251, 0x80, 0xff, ent_cp_1251 },
- { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 },
- { cs_cp866, 0xc0, 0xff, ent_cp_866 },
- { cs_macroman, 0x0b, 0xff, ent_macroman },
- { cs_terminator }
-};
-
-static const struct {
- const char *codeset;
- enum entity_charset charset;
-} charset_map[] = {
- { "ISO-8859-1", cs_8859_1 },
- { "ISO8859-1", cs_8859_1 },
- { "ISO-8859-15", cs_8859_15 },
- { "ISO8859-15", cs_8859_15 },
- { "utf-8", cs_utf_8 },
- { "cp1252", cs_cp1252 },
- { "Windows-1252", cs_cp1252 },
- { "1252", cs_cp1252 },
- { "BIG5", cs_big5 },
- { "950", cs_big5 },
- { "GB2312", cs_gb2312 },
- { "936", cs_gb2312 },
- { "BIG5-HKSCS", cs_big5hkscs },
- { "Shift_JIS", cs_sjis },
- { "SJIS", cs_sjis },
- { "932", cs_sjis },
- { "EUCJP", cs_eucjp },
- { "EUC-JP", cs_eucjp },
- { "KOI8-R", cs_koi8r },
- { "koi8-ru", cs_koi8r },
- { "koi8r", cs_koi8r },
- { "cp1251", cs_cp1251 },
- { "Windows-1251", cs_cp1251 },
- { "win-1251", cs_cp1251 },
- { "iso8859-5", cs_8859_5 },
- { "iso-8859-5", cs_8859_5 },
- { "cp866", cs_cp866 },
- { "866", cs_cp866 },
- { "ibm866", cs_cp866 },
- { "MacRoman", cs_macroman },
- { NULL }
-};
-
-static const struct {
- unsigned short charcode;
- char *entity;
- int entitylen;
- int flags;
-} basic_entities[] = {
- { '"', "&quot;", 6, ENT_HTML_QUOTE_DOUBLE },
- { '\'', "&#039;", 6, ENT_HTML_QUOTE_SINGLE },
- { '\'', "&#39;", 5, ENT_HTML_QUOTE_SINGLE },
- { '<', "&lt;", 4, 0 },
- { '>', "&gt;", 4, 0 },
- { 0, NULL, 0, 0 }
-};
-
-struct basic_entities_dec {
- unsigned short charcode;
- char entity[8];
- int entitylen;
-};
-
+#include "html_tables.h"
+
#define MB_RETURN { \
*newpos = pos; \
mbseq[mbpos] = '\0'; \
@@ -871,6 +460,8 @@ size_t php_utf32_utf8(unsigned char *buf, int k)
{
size_t retval = 0;
+ /* assert(0x0 <= k <= 0x10FFFF); */
+
if (k < 0x80) {
buf[0] = k;
retval = 1;
@@ -883,226 +474,492 @@ size_t php_utf32_utf8(unsigned char *buf, int k)
buf[1] = 0x80 | ((k >> 6) & 0x3f);
buf[2] = 0x80 | (k & 0x3f);
retval = 3;
- } else if (k < 0x200000) {
+ } else {
buf[0] = 0xf0 | (k >> 18);
buf[1] = 0x80 | ((k >> 12) & 0x3f);
buf[2] = 0x80 | ((k >> 6) & 0x3f);
buf[3] = 0x80 | (k & 0x3f);
retval = 4;
- } else if (k < 0x4000000) {
- buf[0] = 0xf8 | (k >> 24);
- buf[1] = 0x80 | ((k >> 18) & 0x3f);
- buf[2] = 0x80 | ((k >> 12) & 0x3f);
- buf[3] = 0x80 | ((k >> 6) & 0x3f);
- buf[4] = 0x80 | (k & 0x3f);
- retval = 5;
- } else {
- buf[0] = 0xfc | (k >> 30);
- buf[1] = 0x80 | ((k >> 24) & 0x3f);
- buf[2] = 0x80 | ((k >> 18) & 0x3f);
- buf[3] = 0x80 | ((k >> 12) & 0x3f);
- buf[4] = 0x80 | ((k >> 6) & 0x3f);
- buf[5] = 0x80 | (k & 0x3f);
- retval = 6;
}
- buf[retval] = '\0';
+ /* UTF-8 has been restricted to max 4 bytes since RFC 3629 */
return retval;
}
/* }}} */
-/* {{{ php_unescape_html_entities
+/* {{{ unimap_bsearch
+ * Binary search of unicode code points in unicode <--> charset mapping.
+ * Returns the code point in the target charset (whose mapping table was given) or 0 if
+ * the unicode code point is not in the table.
*/
-PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
+static unsigned char unimap_bsearch(const unicode_mapping *table, unsigned code_key_a, size_t num)
{
- int retlen, j;
- unsigned int k;
- char *replaced, *ret, *p, *q, *lim, *next;
- enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
- unsigned char replacement[15];
- int replacement_len;
+ const unicode_mapping *l = table,
+ *h = &table[num-1],
+ *m;
+ unsigned short code_key;
+
+ /* we have no mappings outside the BMP */
+ if (code_key_a > 0xFFFFU)
+ return 0;
+
+ code_key = (unsigned short) code_key_a;
+
+ while (l <= h) {
+ m = l + (h - l) / 2;
+ if (code_key < m->un_code_point)
+ h = m - 1;
+ else if (code_key > m->un_code_point)
+ l = m + 1;
+ else
+ return m->cs_code;
+ }
+ return 0;
+}
+/* }}} */
- ret = estrndup(old, oldlen);
- retlen = oldlen;
- if (!retlen) {
- goto empty_source;
+/* {{{ map_from_unicode */
+static int map_from_unicode(unsigned code, enum entity_charset charset, unsigned *res)
+{
+ unsigned char found;
+ const unicode_mapping *table;
+ size_t table_size;
+
+ switch (charset) {
+ case cs_8859_1:
+ /* identity mapping of code points to unicode */
+ if (code > 0xFF) {
+ return FAILURE;
+ }
+ *res = code;
+ break;
+
+ case cs_8859_5:
+ if (code <= 0xA0 || code == 0xAD /* soft hyphen */) {
+ *res = code;
+ } else if (code == 0x2116) {
+ *res = 0xF0; /* numero sign */
+ } else if (code == 0xA7) {
+ *res = 0xFD; /* section sign */
+ } else if (code >= 0x0401 && code <= 0x044F) {
+ if (code == 0x040D || code == 0x0450 || code == 0x045D)
+ return FAILURE;
+ *res = code - 0x360;
+ } else {
+ return FAILURE;
+ }
+ break;
+
+ case cs_8859_15:
+ if (code < 0xA4 || (code > 0xBE && code <= 0xFF)) {
+ *res = code;
+ } else { /* between A4 and 0xBE */
+ found = unimap_bsearch(unimap_iso885915,
+ code, sizeof(unimap_iso885915) / sizeof(*unimap_iso885915));
+ if (found)
+ *res = found;
+ else
+ return FAILURE;
+ }
+ break;
+
+ case cs_cp1252:
+ if (code <= 0x7F || (code >= 0xA0 && code <= 0xFF)) {
+ *res = code;
+ } else {
+ found = unimap_bsearch(unimap_win1252,
+ code, sizeof(unimap_win1252) / sizeof(*unimap_win1252));
+ if (found)
+ *res = found;
+ else
+ return FAILURE;
+ }
+ break;
+
+ case cs_macroman:
+ if (code == 0x7F)
+ return FAILURE;
+ table = unimap_macroman;
+ table_size = sizeof(unimap_macroman) / sizeof(*unimap_macroman);
+ goto table_over_7F;
+ case cs_cp1251:
+ table = unimap_win1251;
+ table_size = sizeof(unimap_win1251) / sizeof(*unimap_win1251);
+ goto table_over_7F;
+ case cs_koi8r:
+ table = unimap_koi8r;
+ table_size = sizeof(unimap_koi8r) / sizeof(*unimap_koi8r);
+ goto table_over_7F;
+ case cs_cp866:
+ table = unimap_cp866;
+ table_size = sizeof(unimap_cp866) / sizeof(*unimap_cp866);
+
+table_over_7F:
+ if (code <= 0x7F) {
+ *res = code;
+ } else {
+ found = unimap_bsearch(table, code, table_size);
+ if (found)
+ *res = found;
+ else
+ return FAILURE;
+ }
+ break;
+
+ /* from here on, only map the possible characters in the ASCII range.
+ * to improve support here, it's a matter of building the unicode mappings.
+ * See <http://www.unicode.org/Public/6.0.0/ucd/Unihan.zip> */
+ case cs_sjis:
+ case cs_eucjp:
+ if (code >= 0x20 && code <= 0x7D) {
+ if (code == 0x5C) /* 0x5C is mapped to the yen symbol */
+ return FAILURE;
+ *res = code;
+ } else {
+ return FAILURE;
+ }
+ break;
+
+ case cs_big5:
+ case cs_big5hkscs:
+ case cs_gb2312:
+ if (code >= 0x20 && code <= 0x7D) {
+ *res = code;
+ } else {
+ return FAILURE;
+ }
+ break;
+
+ default:
+ return FAILURE;
}
-
- if (all) {
- /* look for a match in the maps for this charset */
- for (j = 0; entity_map[j].charset != cs_terminator; j++) {
- if (entity_map[j].charset != charset)
- continue;
- for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) {
- unsigned char entity[32];
- int entity_length = 0;
+ return SUCCESS;
+}
+/* }}} */
- if (entity_map[j].table[k - entity_map[j].basechar] == NULL)
- continue;
+/* {{{ process_numeric_entity
+ * Auxiliary function to traverse_for_entities.
+ * On input, *buf should point to the first character after # and on output, it's the last
+ * byte read, regardless of whether the conversion succeeded or failed.
+ */
+static int process_numeric_entity(char **buf, unsigned *code_point, int all)
+{
+ long code_l;
+ int hexadecimal = (**buf == 'x' || **buf == 'X');
- entity_length = slprintf(entity, sizeof(entity), "&%s;", entity_map[j].table[k - entity_map[j].basechar]);
- if (entity_length >= sizeof(entity)) {
- continue;
- }
+ if (hexadecimal)
+ (*buf)++;
+
+ /* strtol allows whitespace and other stuff in the beginning
+ * that we're not interested in */
+ if (hexadecimal && !isxdigit(**buf) ||
+ !hexadecimal && !isdigit(**buf)) {
+ return FAILURE;
+ }
- /* When we have MBCS entities in the tables above, this will need to handle it */
- replacement_len = 0;
- switch (charset) {
- case cs_8859_1:
- case cs_cp1252:
- case cs_8859_15:
- case cs_cp1251:
- case cs_8859_5:
- case cs_cp866:
- case cs_koi8r:
- replacement[0] = k;
- replacement[1] = '\0';
- replacement_len = 1;
- break;
+ code_l = strtol(*buf, buf, hexadecimal ? 16 : 10);
- case cs_big5:
- case cs_gb2312:
- case cs_big5hkscs:
- case cs_sjis:
- case cs_eucjp:
- /* we cannot properly handle those multibyte encodings
- * with php_str_to_str. skip it. */
- continue;
+ if (**buf != ';')
+ return FAILURE;
- case cs_utf_8:
- replacement_len = php_utf32_utf8(replacement, k);
- break;
+ /* many more are invalid, but that depends on whether it's HTML
+ * (and which version) or XML. Rejecting 0 is handy because that's
+ * the return of strtol if no character was read */
+ if (code_l <= 0L || code_l > 0x10FFFFL)
+ return FAILURE;
+
+ *code_point = (unsigned)code_l;
- default:
- php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!");
- efree(ret);
- return NULL;
- }
+ if (!all) {
+ if (*code_point != '\'' && *code_point != '"')
+ return FAILURE;
+ }
- if (php_memnstr(ret, entity, entity_length, ret+retlen)) {
- replaced = php_str_to_str(ret, retlen, entity, entity_length, replacement, replacement_len, &retlen);
- efree(ret);
- ret = replaced;
- }
- }
- }
+ return SUCCESS;
+}
+/* }}} */
+
+/* {{{ process_named_entity */
+static int process_named_entity(char **buf, unsigned *code_unit_seq, HashTable *inv_map)
+{
+ size_t length;
+ char *start = *buf;
+ unsigned *stored_code;
+
+ /* "&" is represented by a 0x26 in all supported encodings. That means
+ * the byte after represents a character or is the leading byte of a
+ * sequence of 8-bit code units. If in the ranges below, it necessarily
+ * represents an ASCII letter or digit because none of the supported encodings
+ * has an overlap with ASCII in the leading byte (only on the second one) */
+ while (**buf >= 'a' && **buf <= 'z' ||
+ **buf >= 'A' && **buf <= 'Z' ||
+ **buf >= '0' && **buf <= '9') {
+ (*buf)++;
}
- for (j = 0; basic_entities[j].charcode != 0; j++) {
+ if (**buf != ';')
+ return FAILURE;
+
+ /* cast to size_t OK as the quantity is always non-negative */
+ length = *buf - start;
+ if (length == 0 || length > 31) /* 31 is arbitrary */
+ return FAILURE;
+
+ if (zend_hash_find(inv_map, start, (uint)length, (void**)&stored_code) == FAILURE)
+ return FAILURE;
+
+ *code_unit_seq = *stored_code;
+
+ return SUCCESS;
+}
+/* }}} */
+
+/* {{{ traverse_for_entities
+ * Auxiliary function to php_unescape_html_entities().
+ * - The argument "all" determines if all numeric entities are decoded or only those
+ * that correspond to quotes (depending on quote_style). Typically used with the inv_map
+ * stored under the key 0 in BG(inverse_ent_maps).
+ * - Using cs_terminator as charset is legal and has the effect of defaulting to UTF-8. Used
+ * when the encoding doesn't (or shouldn't...) matter.
+ */
+static void traverse_for_entities(char *ret, int *retlen_p, int all, int quote_style, HashTable *inv_map, enum entity_charset charset)
+{
+ int retlen;
+ char *p, *q, *lim;
+
+ /* note: this function assumes the entities always take equal or more space
+ * than the characters they represent in whatever supported external encoding.
+ * The supported encoding that can generate the longest code unit sequences is
+ * UTF-8 (4 bytes). Theoretically, there could be entities with only 3 chars
+ * (e.g. &z;) that would map to outside-the-BMP unicode code points and hence
+ * needed 4 bytes and would overflow, but we have no such thing. */
+
+ if (charset == cs_terminator) /* caller doesn't care; we choose one */
+ charset = cs_utf_8;
+
+ retlen = *retlen_p;
- if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
+ lim = ret + retlen; /* terminator address */
+ assert(*lim == '\0');
+
+ for (p = ret, q = ret; p < lim;) {
+ unsigned code;
+ char *next = NULL;
+ /* is code a unicode code point, or a set of 8-bit code units packed into
+ * an integer with the least significant byte being the last byte? */
+ int unicode;
+
+ /* Shift JIS, Big5 and HKSCS use multi-byte encodings where an
+ * ASCII range byte can be part of a multi-byte sequence.
+ * However, they start at 0x40, therefore if we find a 0x26 byte,
+ * we're sure it represents the '&' character. */
+
+ /* assumes there are no single-char entities */
+ if (p[0] != '&' || (p + 3 >= lim)) {
+ *(q++) = *(p++);
continue;
+ }
+
+ /* now p[3] is surely valid and is no terminator */
+
+ /* numerical entity */
+ if (p[1] == '#') {
+ next = &p[2];
+ if (process_numeric_entity(&next, &code, all) == FAILURE)
+ goto invalid_code;
+ unicode = 1;
+ } else if (inv_map != NULL) {
+ next = &p[1];
+ if (process_named_entity(&next, &code, inv_map) == FAILURE)
+ goto invalid_code;
+ unicode = 0;
+ } else {
+ goto invalid_code;
+ }
- replacement[0] = (unsigned char)basic_entities[j].charcode;
- replacement[1] = '\0';
+ assert(*next == ';');
+
+ if (code == '\'' && !(quote_style & ENT_HTML_QUOTE_SINGLE) ||
+ code == '"' && !(quote_style & ENT_HTML_QUOTE_DOUBLE))
+ goto invalid_code;
+
+ if (unicode && charset != cs_utf_8) {
+ /* replace unicode code point */
+ if (map_from_unicode(code, charset, &code) == FAILURE)
+ goto invalid_code; /* not representable in target charset */
+ }
+
+ switch (charset) {
+ case cs_utf_8:
+ {
+ size_t written;
+ written = php_utf32_utf8((unsigned char*)q, code);
+ q += written;
+ /* Since we're writing in place, we hope we didn't write more than we read */
+ assert(written <= (size_t)(next - p) + 1);
+ break;
+ }
+
+ case cs_8859_1:
+ case cs_cp1252:
+ case cs_8859_15:
+ case cs_koi8r:
+ case cs_cp1251:
+ case cs_8859_5:
+ case cs_cp866:
+ case cs_macroman:
+ /* single byte stuff */
+ *(q++) = code;
+ break;
+
+ case cs_big5:
+ case cs_big5hkscs:
+ case cs_sjis:
+ case cs_gb2312:
+ /* one or two bytes */
+ *(q++) = (code & 0xFFU);
+ if (0xFF00U & code) { /* 2 */
+ *(q++) = (code >> 8);
+ }
+ break;
+
+ case cs_eucjp:
+ /* one to three bytes */
+ *(q++) = code & 0xFFU;
+ if (0xFFFF00U & code) { /* 2 */
+ *(q++) = ((code >> 8) & 0xFFU);
+ if (0xFF0000U & code) /* 3 */
+ *(q++) = (code >> 16);
+ }
+ break;
+
+ default:
+ /* for backwards compatibility */
+ goto invalid_code;
+ break;
+ }
+
+ /* jump over the valid entity; may go beyond size of buffer; np */
+ p = next + 1;
+ continue;
- if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) {
- replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, replacement, 1, &retlen);
- efree(ret);
- ret = replaced;
+invalid_code:
+ for (; p < next; p++) {
+ *(q++) = *p;
}
}
+
+ *q = '\0';
+ *retlen_p = (size_t)(q - ret);
+}
+/* }}} */
- /* replace numeric entities & "&amp;" */
- lim = ret + retlen;
- for (p = ret, q = ret; p < lim;) {
- int code;
+/* {{{ inv_ent_maps_dtor
+ * Hash table destructor for BG(inverse_ent_maps): each element is a pointer to a per-charset HashTable allocated with pemalloc(..., 1), so destroy that table and release it with the matching pefree(..., 1).
+ */
+static void inv_ent_maps_dtor(HashTable **ht) {
+ zend_hash_destroy(*ht);
+ pefree(*ht, 1);
+}
+/* }}} */
- if (p[0] == '&') {
- if (p + 2 < lim) {
- if (p[1] == '#') {
- int invalid_code = 0;
+/* {{{ unescape_inverse_map
+ * Auxiliary function to php_unescape_html_entities()
+ * charset can be cs_terminator for only basic entities.
+ */
+static HashTable *unescape_inverse_map(enum entity_charset charset TSRMLS_DC)
+{
+ HashTable **inverse_map;
- if (p[2] == 'x' || p[2] == 'X') {
- code = strtol(p + 3, &next, 16);
- } else {
- code = strtol(p + 2, &next, 10);
- }
+ /* we accept charset = cs_terminator (for specialchars) */
- if (code == '\'' && !(quote_style & ENT_HTML_QUOTE_SINGLE) ||
- code == '"' && !(quote_style & ENT_HTML_QUOTE_DOUBLE)) {
- invalid_code = 1;
- }
+ if (!BG(inverse_ent_maps)) {
+ BG(inverse_ent_maps) = pemalloc(sizeof *BG(inverse_ent_maps), 1);
+ zend_hash_init(BG(inverse_ent_maps), cs_numelems, NULL, (dtor_func_t)inv_ent_maps_dtor, 1);
+ }
+ if (zend_hash_index_find(BG(inverse_ent_maps), (ulong)charset, (void**)&inverse_map) == FAILURE) {
+ HashTable *ht = pemalloc(sizeof *ht, 1);
+ uint capacity = 0;
+ int j, t;
- if (next != NULL && *next == ';' && !invalid_code) {
- switch (charset) {
- case cs_utf_8:
- q += php_utf32_utf8(q, code);
- break;
-
- case cs_8859_1:
- case cs_8859_5:
- case cs_8859_15:
- if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
- invalid_code = 1;
- } else {
- *(q++) = code;
- }
- break;
+ /* determine upper bound for capacity of hashtable */
+ for (j = 0; entity_map[j].charset != cs_terminator; j++) {
+ if (entity_map[j].charset == charset)
+ capacity += entity_map[j].endchar - entity_map[j].basechar + 1;
+ }
- case cs_cp1252:
- if (code > 0xff) {
- invalid_code = 1;
- } else {
- *(q++) = code;
- }
- break;
-
- case cs_cp1251:
- case cs_cp866:
- case cs_big5:
- case cs_big5hkscs:
- case cs_sjis:
- case cs_eucjp:
- if (code >= 0x80) {
- invalid_code = 1;
- } else {
- *(q++) = code;
- }
- break;
+ /* no destructor as we'll be storing ints */
+ zend_hash_init(ht, capacity, NULL, NULL, 1);
- case cs_gb2312:
- if (code >= 0x81) {
- invalid_code = 1;
- } else {
- *(q++) = code;
- }
- break;
+ /* store new hash table */
+ t = zend_hash_index_update(BG(inverse_ent_maps), (ulong)charset, &ht, sizeof(ht), (void**)&inverse_map);
+ assert(t == SUCCESS);
- default:
- /* for backwards compatilibity */
- invalid_code = 1;
- break;
- }
- if (invalid_code) {
- for (; p <= next; p++) {
- *(q++) = *p;
- }
- }
- p = next + 1;
- } else {
- *(q++) = *(p++);
- *(q++) = *(p++);
- }
- } else if (p + 4 < lim &&
- p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' &&
- p[4] == ';') {
- *(q++) = '&';
- p += 5;
- } else {
- *(q++) = *(p++);
- *(q++) = *(p++);
- }
- } else {
- *(q++) = *(p++);
+ /* build inverse map */
+ for (j = 0; entity_map[j].charset != cs_terminator; j++) {
+ unsigned k;
+
+ if (entity_map[j].charset != charset)
+ continue;
+
+ for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) {
+ unsigned table_offset = k - entity_map[j].basechar;
+ const char* entity_name = entity_map[j].table[table_offset];
+
+ if (entity_name == NULL || *entity_name == '#')
+ continue;
+
+ t = zend_hash_update(ht, entity_name, strlen(entity_name), &k, sizeof(k), NULL);
+ assert(t == SUCCESS);
}
- } else {
- *(q++) = *(p++);
+ }
+
+ /* and add the basic entities */
+ for (j = 0; basic_entities_ex[j].charcode != 0; j++) {
+ const basic_entity_t *ent = &basic_entities_ex[j];
+ unsigned k = ent->charcode;
+
+ t = zend_hash_update(ht, &ent->entity[1] /* skip & */,
+ ent->entitylen - 2 /* skip & and ; */, &k, sizeof(k), NULL);
+ assert(t == SUCCESS);
}
}
- *q = '\0';
- retlen = (size_t)(q - ret);
+
+ return *inverse_map;
+}
+
+/* {{{ php_unescape_html_entities
+ * The parameter "all" should be true to decode all possible entities, false to decode
+ * only the basic ones, i.e., those in basic_entities_ex + the numeric entities
+ * that correspond to quotes.
+ */
+PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
+{
+ int retlen;
+ char *ret;
+ enum entity_charset charset;
+ HashTable *inverse_map = NULL;
+
+ if (all) {
+ charset = determine_charset(hint_charset TSRMLS_CC);
+ } else {
+ charset = cs_terminator;
+ }
+
+ ret = estrndup(old, oldlen);
+ retlen = oldlen;
+ if (retlen == 0) {
+ goto empty_source;
+ }
+
+ /* charset == cs_terminator if !all */
+ inverse_map = unescape_inverse_map(charset TSRMLS_CC);
+
+ /* replace numeric and named entities */
+ /* !all implies charset == cs_terminator && inverse_map == BG(inverse_ent_maps)[0] */
+ traverse_for_entities(ret, &retlen, all, quote_style, inverse_map, charset);
+
empty_source:
*newlen = retlen;
return ret;
@@ -1315,65 +1172,20 @@ PHP_FUNCTION(htmlspecialchars)
Convert special HTML entities back to characters */
PHP_FUNCTION(htmlspecialchars_decode)
{
- char *str, *new_str, *e, *p;
- int len, j, i, new_len;
+ char *str;
+ int str_len, len;
long quote_style = ENT_COMPAT;
- struct basic_entities_dec basic_entities_dec[8];
+ char *replaced;
- if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, &quote_style) == FAILURE) {
+ if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len, &quote_style) == FAILURE) {
return;
}
- new_str = estrndup(str, len);
- new_len = len;
- e = new_str + new_len;
-
- if (!(p = memchr(new_str, '&', new_len))) {
- RETURN_STRINGL(new_str, new_len, 0);
- }
-
- for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) {
- if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) {
- continue;
- }
- basic_entities_dec[j].charcode = basic_entities[i].charcode;
- memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1);
- basic_entities_dec[j].entitylen = basic_entities[i].entitylen;
- j++;
+ replaced = php_unescape_html_entities(str, str_len, &len, 0 /*!all*/, quote_style, NULL TSRMLS_CC);
+ if (replaced) {
+ RETURN_STRINGL(replaced, len, 0);
}
- basic_entities_dec[j].charcode = '&';
- basic_entities_dec[j].entitylen = sizeof("&amp;") - 1;
- memcpy(basic_entities_dec[j].entity, "&amp;", sizeof("&amp;"));
- i = j + 1;
-
- do {
- int l = e - p;
-
- for (j = 0; j < i; j++) {
- if (basic_entities_dec[j].entitylen > l) {
- continue;
- }
- if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) {
- int e_len = basic_entities_dec[j].entitylen - 1;
-
- *p++ = (char) basic_entities_dec[j].charcode;
- memmove(p, p + e_len, (e - p - e_len));
- e -= e_len;
- goto done;
- }
- }
- p++;
-
-done:
- if (p >= e) {
- break;
- }
- } while ((p = memchr(p, '&', (e - p))));
-
- new_len = e - new_str;
-
- new_str[new_len] = '\0';
- RETURN_STRINGL(new_str, new_len, 0);
+ RETURN_FALSE;
}
/* }}} */
@@ -1391,7 +1203,7 @@ PHP_FUNCTION(html_entity_decode)
return;
}
- replaced = php_unescape_html_entities(str, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC);
+ replaced = php_unescape_html_entities(str, str_len, &len, 1 /*all*/, quote_style, hint_charset TSRMLS_CC);
if (replaced) {
RETURN_STRINGL(replaced, len, 0);
}
@@ -1446,15 +1258,20 @@ PHP_FUNCTION(get_html_translation_table)
/* break thru */
case HTML_SPECIALCHARS:
- for (j = 0; basic_entities[j].charcode != 0; j++) {
+ for (j = 0; basic_entities_ex[j].charcode != 0; j++) {
+ void *dummy;
- if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
+ if (basic_entities_ex[j].flags && (quote_style & basic_entities_ex[j].flags) == 0)
continue;
- ind[0] = (unsigned char)basic_entities[j].charcode;
- add_assoc_stringl(return_value, ind, basic_entities[j].entity, basic_entities[j].entitylen, 1);
+ ind[0] = (unsigned char)basic_entities_ex[j].charcode;
+ if (zend_hash_find(Z_ARRVAL_P(return_value), ind, sizeof(ind), &dummy) == FAILURE) {
+ /* in case of the single quote, which is repeated, the first one wins,
+ * so don't replace the existing mapping */
+ add_assoc_stringl(return_value, ind, basic_entities_ex[j].entity,
+ basic_entities_ex[j].entitylen, 1);
+ }
}
- add_assoc_stringl(return_value, "&", "&amp;", sizeof("&amp;") - 1, 1);
break;
}
diff --git a/ext/standard/html_tables.h b/ext/standard/html_tables.h
new file mode 100644
index 0000000000..d3a638b695
--- /dev/null
+++ b/ext/standard/html_tables.h
@@ -0,0 +1,2080 @@
+/*
+ +----------------------------------------------------------------------+
+ | PHP Version 5 |
+ +----------------------------------------------------------------------+
+ | Copyright (c) 1997-2010 The PHP Group |
+ +----------------------------------------------------------------------+
+ | This source file is subject to version 3.01 of the PHP license, |
+ | that is bundled with this package in the file LICENSE, and is |
+ | available through the world-wide-web at the following url: |
+ | http://www.php.net/license/3_01.txt |
+ | If you did not receive a copy of the PHP license and are unable to |
+ | obtain it through the world-wide-web, please send a note to |
+ | license@php.net so we can mail you a copy immediately. |
+ +----------------------------------------------------------------------+
+ | Author: Rasmus Lerdorf <rasmus@lerdorf.on.ca> |
+ +----------------------------------------------------------------------+
+*/
+
+/* $Id: html.h 293036 2010-01-03 09:23:27Z sebastian $ */
+
+#ifndef HTML_TABLES_H
+#define HTML_TABLES_H
+
+/* cs_terminator is overloaded in the following fashion:
+ * - It terminates the list of entity maps.
+ * - In BG(inverse_ent_maps), it's the key of the inverse map that stores
+ * only the basic entities.
+ * - When passed to traverse_for_entities (or via php_unescape_html_entities with !all),
+ * it means we don't care about the encoding (UTF-8 is chosen, so it should only
+ * be used when the encoding doesn't matter).
+ */
+enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252,
+ cs_8859_15, cs_utf_8, cs_big5, cs_gb2312,
+ cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r,
+ cs_cp1251, cs_8859_5, cs_cp866, cs_macroman,
+ cs_numelems /* used to count the number of charsets */
+ };
+typedef const char *const entity_table_t;
+
+/* codepage 1252 is a Windows extension to iso-8859-1. */
+static entity_table_t ent_cp_1252[] = {
+ "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger",
+ "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
+ NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo",
+ "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
+ "oelig", NULL, NULL, "Yuml"
+};
+
+static entity_table_t ent_iso_8859_1[] = {
+ "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
+ "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
+ "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
+ "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
+ "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
+ "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
+ "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
+ "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
+ "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
+ "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
+ "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
+ "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
+ "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
+ "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
+ "uuml", "yacute", "thorn", "yuml"
+};
+
+static entity_table_t ent_iso_8859_15[] = {
+ "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
+ "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
+ "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */
+ "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm",
+ "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
+ "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
+ "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
+ "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
+ "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
+ "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
+ "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
+ "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
+ "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
+ "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
+ "uuml", "yacute", "thorn", "yuml"
+};
+
+static entity_table_t ent_uni_338_402[] = {
+ /* 338 (0x0152) */
+ "OElig", "oelig", NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 352 (0x0160) */
+ "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 376 (0x0178) */
+ "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 400 (0x0190) */
+ NULL, NULL, "fnof"
+};
+
+static entity_table_t ent_uni_spacing[] = {
+ /* 710 */
+ "circ",
+ /* 711 - 730 */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 731 - 732 */
+ NULL, "tilde"
+};
+
+static entity_table_t ent_uni_greek[] = {
+ /* 913 */
+ "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
+ "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
+ NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
+ /* 938 - 944 are not mapped */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
+ "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
+ "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
+ /* 970 - 976 are not mapped */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ "thetasym", "upsih",
+ NULL, NULL, NULL,
+ "piv"
+};
+
+static entity_table_t ent_uni_punct[] = {
+ /* 8194 */
+ "ensp", "emsp", NULL, NULL, NULL, NULL, NULL,
+ "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
+ NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
+ /* 8216 */
+ "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
+ "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip",
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL,
+ /* 8242 */
+ "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
+ NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
+ "frasl"
+};
+
+static entity_table_t ent_uni_euro[] = {
+ "euro"
+};
+
+static entity_table_t ent_uni_8465_8501[] = {
+ /* 8465 */
+ "image", NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8472 */
+ "weierp", NULL, NULL, NULL,
+ /* 8476 */
+ "real", NULL, NULL, NULL, NULL, NULL,
+ /* 8482 */
+ "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8501 */
+ "alefsym",
+};
+
+static entity_table_t ent_uni_8592_9002[] = {
+ /* 8592 (0x2190) */
+ "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8608 (0x21a0) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8624 (0x21b0) */
+ NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8640 (0x21c0) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8656 (0x21d0) */
+ "lArr", "uArr", "rArr", "dArr", "hArr", NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8672 (0x21e0) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8704 (0x2200) */
+ "forall", NULL, "part", "exist", NULL, "empty", NULL, "nabla",
+ "isin", "notin", NULL, "ni", NULL, NULL, NULL, "prod",
+ /* 8720 (0x2210) */
+ NULL, "sum", "minus", NULL, NULL, NULL, NULL, "lowast",
+ NULL, NULL, "radic", NULL, NULL, "prop", "infin", NULL,
+ /* 8736 (0x2220) */
+ "ang", NULL, NULL, NULL, NULL, NULL, NULL, "and",
+ "or", "cap", "cup", "int", NULL, NULL, NULL, NULL,
+ /* 8752 (0x2230) */
+ NULL, NULL, NULL, NULL, "there4", NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, "sim", NULL, NULL, NULL,
+ /* 8768 (0x2240) */
+ NULL, NULL, NULL, NULL, NULL, "cong", NULL, NULL,
+ "asymp", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8784 (0x2250) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8800 (0x2260) */
+ "ne", "equiv", NULL, NULL, "le", "ge", NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8816 (0x2270) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8832 (0x2280) */
+ NULL, NULL, "sub", "sup", "nsub", NULL, "sube", "supe",
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8848 (0x2290) */
+ NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8864 (0x22a0) */
+ NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8880 (0x22b0) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8896 (0x22c0) */
+ NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8912 (0x22d0) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8928 (0x22e0) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8944 (0x22f0) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8960 (0x2300) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
+ /* 8976 (0x2310) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8992 (0x2320) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, "lang", "rang"
+};
+
+static entity_table_t ent_uni_9674[] = {
+ /* 9674 */
+ "loz"
+};
+
+static entity_table_t ent_uni_9824_9830[] = {
+ /* 9824 */
+ "spades", NULL, NULL, "clubs", NULL, "hearts", "diams"
+};
+
+static entity_table_t ent_koi8r[] = {
+ "#1105", /* "jo "*/
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092",
+ "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084",
+ "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090",
+ "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096",
+ "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041",
+ "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048",
+ "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055",
+ "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
+ "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063",
+ "#1066"
+};
+
+static entity_table_t ent_cp_1251[] = { /* Windows-1251 bytes 0x80-0xff (range per entity_map); NULL = no entity, "#NNN" = numeric reference */
+ "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
+ "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
+ "#1035", "#1039", "#1106", "#8216", "#8217", "#8220", "#8221", /* 0x93/0x94 are U+201C/U+201D (left/right double quotation mark) */
+ "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250",
+ "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
+ "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
+ "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
+ "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
+ "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
+ "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046",
+ "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
+ "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
+ "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
+ "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
+ "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
+ "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
+ "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
+ "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
+ "#1103"
+};
+
+static entity_table_t ent_iso_8859_5[] = { /* ISO-8859-5 bytes 0xc0-0xff (range per entity_map); all entries are numeric references */
+ "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
+ "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
+ "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
+ "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
+ "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
+ "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
+ "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#8470", /* 0xf0 is U+2116 (numero sign), not U+0450 */
+ "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
+ "#1112", "#1113", "#1114", "#1115", "#1116", "#167", "#1118", /* 0xfd is U+00A7 (section sign), not U+045D */
+ "#1119"
+};
+
+static entity_table_t ent_cp_866[] = {
+
+ "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566",
+ "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552",
+ "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560",
+ "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608",
+ "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090",
+ "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
+ "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025",
+ "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118",
+ "#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632",
+ "#160"
+};
+
+/* MacRoman has a couple of low-ascii chars that need mapping too */
+/* Vertical tab (ASCII 11) is often used to store line breaks inside */
+/* DB exports, this mapping changes it to a space */
+static entity_table_t ent_macroman[] = {
+ "sp", NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, "quot", NULL,
+ NULL, NULL, "amp", NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, "lt", NULL, "gt", NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml",
+ "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
+ "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
+ "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
+ "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg",
+ "cent", "pound", "sect", "bull", "para", "szlig", "reg",
+ "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
+ "infin", "plusmn", "le", "ge", "yen", "micro", "part",
+ "sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
+ "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof",
+ "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
+ "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
+ "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
+ "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger",
+ "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
+ "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
+ "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
+ "circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
+ "#733", "#731", "#711"
+};
+
+struct html_entity_map {
+ enum entity_charset charset; /* charset identifier */
+ unsigned int basechar; /* char code at start of table */
+ unsigned int endchar; /* last char code in the table */
+ entity_table_t *table; /* the table of mappings */
+};
+
+static const struct html_entity_map entity_map[] = {
+ { cs_cp1252, 0x80, 0x9f, ent_cp_1252 },
+ { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 },
+ { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 },
+ { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 },
+ { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 },
+ { cs_utf_8, 338, 402, ent_uni_338_402 },
+ { cs_utf_8, 710, 732, ent_uni_spacing },
+ { cs_utf_8, 913, 982, ent_uni_greek },
+ { cs_utf_8, 8194, 8260, ent_uni_punct },
+ { cs_utf_8, 8364, 8364, ent_uni_euro },
+ { cs_utf_8, 8465, 8501, ent_uni_8465_8501 },
+ { cs_utf_8, 8592, 9002, ent_uni_8592_9002 },
+ { cs_utf_8, 9674, 9674, ent_uni_9674 },
+ { cs_utf_8, 9824, 9830, ent_uni_9824_9830 },
+ { cs_big5, 0xa0, 0xff, ent_iso_8859_1 },
+ { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 },
+ { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
+ { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 },
+ { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 },
+ { cs_koi8r, 0xa3, 0xff, ent_koi8r },
+ { cs_cp1251, 0x80, 0xff, ent_cp_1251 },
+ { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 },
+ { cs_cp866, 0xc0, 0xff, ent_cp_866 },
+ { cs_macroman, 0x0b, 0xff, ent_macroman },
+ { cs_terminator }
+};
+
+static const struct {
+ const char *codeset;
+ enum entity_charset charset;
+} charset_map[] = {
+ { "ISO-8859-1", cs_8859_1 },
+ { "ISO8859-1", cs_8859_1 },
+ { "ISO-8859-15", cs_8859_15 },
+ { "ISO8859-15", cs_8859_15 },
+ { "utf-8", cs_utf_8 },
+ { "cp1252", cs_cp1252 },
+ { "Windows-1252", cs_cp1252 },
+ { "1252", cs_cp1252 },
+ { "BIG5", cs_big5 },
+ { "950", cs_big5 },
+ { "GB2312", cs_gb2312 },
+ { "936", cs_gb2312 },
+ { "BIG5-HKSCS", cs_big5hkscs },
+ { "Shift_JIS", cs_sjis },
+ { "SJIS", cs_sjis },
+ { "932", cs_sjis },
+ { "EUCJP", cs_eucjp },
+ { "EUC-JP", cs_eucjp },
+ { "KOI8-R", cs_koi8r },
+ { "koi8-ru", cs_koi8r },
+ { "koi8r", cs_koi8r },
+ { "cp1251", cs_cp1251 },
+ { "Windows-1251", cs_cp1251 },
+ { "win-1251", cs_cp1251 },
+ { "iso8859-5", cs_8859_5 },
+ { "iso-8859-5", cs_8859_5 },
+ { "cp866", cs_cp866 },
+ { "866", cs_cp866 },
+ { "ibm866", cs_cp866 },
+ { "MacRoman", cs_macroman },
+ { NULL }
+};
+
+typedef struct { /* one row of the basic-entity table (see basic_entities_ex) */
+ unsigned short charcode; /* character the entity stands for; 0 marks the end of the table */
+ char *entity; /* entity text including the leading '&' and the trailing ';' */
+ int entitylen; /* length of entity in bytes, not counting the NUL terminator */
+ int flags; /* ENT_HTML_QUOTE_* bit that must be set in quote_style, or 0 if the entry always applies */
+} basic_entity_t;
+
+static const basic_entity_t basic_entities_ex[] = {
+ { '&', "&amp;", 5, 0 },
+ { '"', "&quot;", 6, ENT_HTML_QUOTE_DOUBLE },
+ /* PHP traditionally encodes ' as &#039;, not &apos;, so leave this entry here */
+ { '\'', "&#039;", 6, ENT_HTML_QUOTE_SINGLE },
+ { '\'', "&apos;", 6, ENT_HTML_QUOTE_SINGLE },
+ { '<', "&lt;", 4, 0 },
+ { '>', "&gt;", 4, 0 },
+ { 0, NULL, 0, 0 }
+};
+
+/* In some cases, we need to give special treatment to &, so we
+ * use this instead */
+static const basic_entity_t *basic_entities = &basic_entities_ex[1];
+
+typedef struct {
+ unsigned short un_code_point; /* we don't need bigger */
+ unsigned char cs_code; /* currently, we only have maps to single-byte encodings */
+} unicode_mapping;
+
+static const unicode_mapping unimap_iso885915[] = {
+ { 0xA5, 0xA5 }, /* yen sign */
+ { 0xA7, 0xA7 }, /* section sign */
+ { 0xA9, 0xA9 }, /* copyright sign */
+ { 0xAA, 0xAA }, /* feminine ordinal indicator */
+ { 0xAB, 0xAB }, /* left-pointing double angle quotation mark */
+ { 0xAC, 0xAC }, /* not sign */
+ { 0xAD, 0xAD }, /* soft hyphen */
+ { 0xAE, 0xAE }, /* registered sign */
+ { 0xAF, 0xAF }, /* macron */
+ { 0xB0, 0xB0 }, /* degree sign */
+ { 0xB1, 0xB1 }, /* plus-minus sign */
+ { 0xB2, 0xB2 }, /* superscript two */
+ { 0xB3, 0xB3 }, /* superscript three */
+ { 0xB5, 0xB5 }, /* micro sign */
+ { 0xB6, 0xB6 }, /* pilcrow sign */
+ { 0xB7, 0xB7 }, /* middle dot */
+ { 0xB9, 0xB9 }, /* superscript one */
+ { 0xBA, 0xBA }, /* masculine ordinal indicator */
+ { 0xBB, 0xBB }, /* right-pointing double angle quotation mark */
+ { 0x152, 0xBC }, /* latin capital ligature oe */
+ { 0x153, 0xBD }, /* latin small ligature oe */
+ { 0x160, 0xA6 }, /* latin capital letter s with caron */
+ { 0x161, 0xA8 }, /* latin small letter s with caron */
+ { 0x178, 0xBE }, /* latin capital letter y with diaeresis */
+ { 0x17D, 0xB4 }, /* latin capital letter z with caron */
+ { 0x17E, 0xB8 }, /* latin small letter z with caron */
+ { 0x20AC, 0xA4 }, /* euro sign */
+};
+
+static const unicode_mapping unimap_win1252[] = {
+ { 0x152, 0x8C }, /* latin capital ligature oe */
+ { 0x153, 0x9C }, /* latin small ligature oe */
+ { 0x160, 0x8A }, /* latin capital letter s with caron */
+ { 0x161, 0x9A }, /* latin small letter s with caron */
+ { 0x178, 0x9F }, /* latin capital letter y with diaeresis */
+ { 0x17D, 0x8E }, /* latin capital letter z with caron */
+ { 0x17E, 0x9E }, /* latin small letter z with caron */
+ { 0x192, 0x83 }, /* latin small letter f with hook */
+ { 0x2C6, 0x88 }, /* modifier letter circumflex accent */
+ { 0x2DC, 0x98 }, /* small tilde */
+ { 0x2013, 0x96 }, /* en dash */
+ { 0x2014, 0x97 }, /* em dash */
+ { 0x2018, 0x91 }, /* left single quotation mark */
+ { 0x2019, 0x92 }, /* right single quotation mark */
+ { 0x201A, 0x82 }, /* single low-9 quotation mark */
+ { 0x201C, 0x93 }, /* left double quotation mark */
+ { 0x201D, 0x94 }, /* right double quotation mark */
+ { 0x201E, 0x84 }, /* double low-9 quotation mark */
+ { 0x2020, 0x86 }, /* dagger */
+ { 0x2021, 0x87 }, /* double dagger */
+ { 0x2022, 0x95 }, /* bullet */
+ { 0x2026, 0x85 }, /* horizontal ellipsis */
+ { 0x2030, 0x89 }, /* per mille sign */
+ { 0x2039, 0x8B }, /* single left-pointing angle quotation mark */
+ { 0x203A, 0x9B }, /* single right-pointing angle quotation mark */
+ { 0x20AC, 0x80 }, /* euro sign */
+ { 0x2122, 0x99 }, /* trade mark sign */
+};
+
+static const unicode_mapping unimap_win1251[] = { /* Windows-1251: { Unicode code point, charset code } pairs, listed in ascending code point order */
+ { 0xA0, 0xA0 }, /* no-break space */
+ { 0xA4, 0xA4 }, /* currency sign */
+ { 0xA6, 0xA6 }, /* broken bar */
+ { 0xA7, 0xA7 }, /* section sign */
+ { 0xA9, 0xA9 }, /* copyright sign */
+ { 0xAB, 0xAB }, /* left-pointing double angle quotation mark */
+ { 0xAC, 0xAC }, /* not sign */
+ { 0xAD, 0xAD }, /* soft hyphen */
+ { 0xAE, 0xAE }, /* registered sign */
+ { 0xB0, 0xB0 }, /* degree sign */
+ { 0xB1, 0xB1 }, /* plus-minus sign */
+ { 0xB5, 0xB5 }, /* micro sign */
+ { 0xB6, 0xB6 }, /* pilcrow sign */
+ { 0xB7, 0xB7 }, /* middle dot */
+ { 0xBB, 0xBB }, /* right-pointing double angle quotation mark */
+ { 0x401, 0xA8 }, /* cyrillic capital letter io */
+ { 0x402, 0x80 }, /* cyrillic capital letter dje */
+ { 0x403, 0x81 }, /* cyrillic capital letter gje */
+ { 0x404, 0xAA }, /* cyrillic capital letter ukrainian ie */
+ { 0x405, 0xBD }, /* cyrillic capital letter dze */
+ { 0x406, 0xB2 }, /* cyrillic capital letter byelorussian-ukrainian i */
+ { 0x407, 0xAF }, /* cyrillic capital letter yi */
+ { 0x408, 0xA3 }, /* cyrillic capital letter je */
+ { 0x409, 0x8A }, /* cyrillic capital letter lje */
+ { 0x40A, 0x8C }, /* cyrillic capital letter nje */
+ { 0x40B, 0x8E }, /* cyrillic capital letter tshe */
+ { 0x40C, 0x8D }, /* cyrillic capital letter kje */
+ { 0x40E, 0xA1 }, /* cyrillic capital letter short u */
+ { 0x40F, 0x8F }, /* cyrillic capital letter dzhe */
+ { 0x410, 0xC0 }, /* cyrillic capital letter a */
+ { 0x411, 0xC1 }, /* cyrillic capital letter be */
+ { 0x412, 0xC2 }, /* cyrillic capital letter ve */
+ { 0x413, 0xC3 }, /* cyrillic capital letter ghe */
+ { 0x414, 0xC4 }, /* cyrillic capital letter de */
+ { 0x415, 0xC5 }, /* cyrillic capital letter ie */
+ { 0x416, 0xC6 }, /* cyrillic capital letter zhe */
+ { 0x417, 0xC7 }, /* cyrillic capital letter ze */
+ { 0x418, 0xC8 }, /* cyrillic capital letter i */
+ { 0x419, 0xC9 }, /* cyrillic capital letter short i */
+ { 0x41A, 0xCA }, /* cyrillic capital letter ka */
+ { 0x41B, 0xCB }, /* cyrillic capital letter el */
+ { 0x41C, 0xCC }, /* cyrillic capital letter em */
+ { 0x41D, 0xCD }, /* cyrillic capital letter en */
+ { 0x41E, 0xCE }, /* cyrillic capital letter o */
+ { 0x41F, 0xCF }, /* cyrillic capital letter pe */
+ { 0x420, 0xD0 }, /* cyrillic capital letter er */
+ { 0x421, 0xD1 }, /* cyrillic capital letter es */
+ { 0x422, 0xD2 }, /* cyrillic capital letter te */
+ { 0x423, 0xD3 }, /* cyrillic capital letter u */
+ { 0x424, 0xD4 }, /* cyrillic capital letter ef */
+ { 0x425, 0xD5 }, /* cyrillic capital letter ha */
+ { 0x426, 0xD6 }, /* cyrillic capital letter tse */
+ { 0x427, 0xD7 }, /* cyrillic capital letter che */
+ { 0x428, 0xD8 }, /* cyrillic capital letter sha */
+ { 0x429, 0xD9 }, /* cyrillic capital letter shcha */
+ { 0x42A, 0xDA }, /* cyrillic capital letter hard sign */
+ { 0x42B, 0xDB }, /* cyrillic capital letter yeru */
+ { 0x42C, 0xDC }, /* cyrillic capital letter soft sign */
+ { 0x42D, 0xDD }, /* cyrillic capital letter e */
+ { 0x42E, 0xDE }, /* cyrillic capital letter yu */
+ { 0x42F, 0xDF }, /* cyrillic capital letter ya */
+ { 0x430, 0xE0 }, /* cyrillic small letter a */
+ { 0x431, 0xE1 }, /* cyrillic small letter be */
+ { 0x432, 0xE2 }, /* cyrillic small letter ve */
+ { 0x433, 0xE3 }, /* cyrillic small letter ghe */
+ { 0x434, 0xE4 }, /* cyrillic small letter de */
+ { 0x435, 0xE5 }, /* cyrillic small letter ie */
+ { 0x436, 0xE6 }, /* cyrillic small letter zhe */
+ { 0x437, 0xE7 }, /* cyrillic small letter ze */
+ { 0x438, 0xE8 }, /* cyrillic small letter i */
+ { 0x439, 0xE9 }, /* cyrillic small letter short i */
+ { 0x43A, 0xEA }, /* cyrillic small letter ka */
+ { 0x43B, 0xEB }, /* cyrillic small letter el */
+ { 0x43C, 0xEC }, /* cyrillic small letter em */
+ { 0x43D, 0xED }, /* cyrillic small letter en */
+ { 0x43E, 0xEE }, /* cyrillic small letter o */
+ { 0x43F, 0xEF }, /* cyrillic small letter pe */
+ { 0x440, 0xF0 }, /* cyrillic small letter er */
+ { 0x441, 0xF1 }, /* cyrillic small letter es */
+ { 0x442, 0xF2 }, /* cyrillic small letter te */
+ { 0x443, 0xF3 }, /* cyrillic small letter u */
+ { 0x444, 0xF4 }, /* cyrillic small letter ef */
+ { 0x445, 0xF5 }, /* cyrillic small letter ha */
+ { 0x446, 0xF6 }, /* cyrillic small letter tse */
+ { 0x447, 0xF7 }, /* cyrillic small letter che */
+ { 0x448, 0xF8 }, /* cyrillic small letter sha */
+ { 0x449, 0xF9 }, /* cyrillic small letter shcha */
+ { 0x44A, 0xFA }, /* cyrillic small letter hard sign */
+ { 0x44B, 0xFB }, /* cyrillic small letter yeru */
+ { 0x44C, 0xFC }, /* cyrillic small letter soft sign */
+ { 0x44D, 0xFD }, /* cyrillic small letter e */
+ { 0x44E, 0xFE }, /* cyrillic small letter yu */
+ { 0x44F, 0xFF }, /* cyrillic small letter ya */
+ { 0x451, 0xB8 }, /* cyrillic small letter io */
+ { 0x452, 0x90 }, /* cyrillic small letter dje */
+ { 0x453, 0x83 }, /* cyrillic small letter gje */
+ { 0x454, 0xBA }, /* cyrillic small letter ukrainian ie */
+ { 0x455, 0xBE }, /* cyrillic small letter dze */
+ { 0x456, 0xB3 }, /* cyrillic small letter byelorussian-ukrainian i */
+ { 0x457, 0xBF }, /* cyrillic small letter yi */
+ { 0x458, 0xBC }, /* cyrillic small letter je */
+ { 0x459, 0x9A }, /* cyrillic small letter lje */
+ { 0x45A, 0x9C }, /* cyrillic small letter nje */
+ { 0x45B, 0x9E }, /* cyrillic small letter tshe */
+ { 0x45C, 0x9D }, /* cyrillic small letter kje */
+ { 0x45E, 0xA2 }, /* cyrillic small letter short u */
+ { 0x45F, 0x9F }, /* cyrillic small letter dzhe */
+ { 0x490, 0xA5 }, /* cyrillic capital letter ghe with upturn */
+ { 0x491, 0xB4 }, /* cyrillic small letter ghe with upturn */
+ { 0x2013, 0x96 }, /* en dash */
+ { 0x2014, 0x97 }, /* em dash */
+ { 0x2018, 0x91 }, /* left single quotation mark */
+ { 0x2019, 0x92 }, /* right single quotation mark */
+ { 0x201A, 0x82 }, /* single low-9 quotation mark */
+ { 0x201C, 0x93 }, /* left double quotation mark */
+ { 0x201D, 0x94 }, /* right double quotation mark */
+ { 0x201E, 0x84 }, /* double low-9 quotation mark */
+ { 0x2020, 0x86 }, /* dagger */
+ { 0x2021, 0x87 }, /* double dagger */
+ { 0x2022, 0x95 }, /* bullet */
+ { 0x2026, 0x85 }, /* horizontal ellipsis */
+ { 0x2030, 0x89 }, /* per mille sign */
+ { 0x2039, 0x8B }, /* single left-pointing angle quotation mark */
+ { 0x203A, 0x9B }, /* single right-pointing angle quotation mark */
+ { 0x20AC, 0x88 }, /* euro sign */
+ { 0x2116, 0xB9 }, /* numero sign */
+ { 0x2122, 0x99 }, /* trade mark sign */
+};
+
+static const unicode_mapping unimap_koi8r[] = { /* KOI8-R: { Unicode code point, charset code } pairs, listed in ascending code point order */
+ { 0xA0, 0x9A }, /* no-break space */
+ { 0xA9, 0xBF }, /* copyright sign */
+ { 0xB0, 0x9C }, /* degree sign */
+ { 0xB2, 0x9D }, /* superscript two */
+ { 0xB7, 0x9E }, /* middle dot */
+ { 0xF7, 0x9F }, /* division sign */
+ { 0x401, 0xB3 }, /* cyrillic capital letter io */
+ { 0x410, 0xE1 }, /* cyrillic capital letter a */
+ { 0x411, 0xE2 }, /* cyrillic capital letter be */
+ { 0x412, 0xF7 }, /* cyrillic capital letter ve */
+ { 0x413, 0xE7 }, /* cyrillic capital letter ghe */
+ { 0x414, 0xE4 }, /* cyrillic capital letter de */
+ { 0x415, 0xE5 }, /* cyrillic capital letter ie */
+ { 0x416, 0xF6 }, /* cyrillic capital letter zhe */
+ { 0x417, 0xFA }, /* cyrillic capital letter ze */
+ { 0x418, 0xE9 }, /* cyrillic capital letter i */
+ { 0x419, 0xEA }, /* cyrillic capital letter short i */
+ { 0x41A, 0xEB }, /* cyrillic capital letter ka */
+ { 0x41B, 0xEC }, /* cyrillic capital letter el */
+ { 0x41C, 0xED }, /* cyrillic capital letter em */
+ { 0x41D, 0xEE }, /* cyrillic capital letter en */
+ { 0x41E, 0xEF }, /* cyrillic capital letter o */
+ { 0x41F, 0xF0 }, /* cyrillic capital letter pe */
+ { 0x420, 0xF2 }, /* cyrillic capital letter er */
+ { 0x421, 0xF3 }, /* cyrillic capital letter es */
+ { 0x422, 0xF4 }, /* cyrillic capital letter te */
+ { 0x423, 0xF5 }, /* cyrillic capital letter u */
+ { 0x424, 0xE6 }, /* cyrillic capital letter ef */
+ { 0x425, 0xE8 }, /* cyrillic capital letter ha */
+ { 0x426, 0xE3 }, /* cyrillic capital letter tse */
+ { 0x427, 0xFE }, /* cyrillic capital letter che */
+ { 0x428, 0xFB }, /* cyrillic capital letter sha */
+ { 0x429, 0xFD }, /* cyrillic capital letter shcha */
+ { 0x42A, 0xFF }, /* cyrillic capital letter hard sign */
+ { 0x42B, 0xF9 }, /* cyrillic capital letter yeru */
+ { 0x42C, 0xF8 }, /* cyrillic capital letter soft sign */
+ { 0x42D, 0xFC }, /* cyrillic capital letter e */
+ { 0x42E, 0xE0 }, /* cyrillic capital letter yu */
+ { 0x42F, 0xF1 }, /* cyrillic capital letter ya */
+ { 0x430, 0xC1 }, /* cyrillic small letter a */
+ { 0x431, 0xC2 }, /* cyrillic small letter be */
+ { 0x432, 0xD7 }, /* cyrillic small letter ve */
+ { 0x433, 0xC7 }, /* cyrillic small letter ghe */
+ { 0x434, 0xC4 }, /* cyrillic small letter de */
+ { 0x435, 0xC5 }, /* cyrillic small letter ie */
+ { 0x436, 0xD6 }, /* cyrillic small letter zhe */
+ { 0x437, 0xDA }, /* cyrillic small letter ze */
+ { 0x438, 0xC9 }, /* cyrillic small letter i */
+ { 0x439, 0xCA }, /* cyrillic small letter short i */
+ { 0x43A, 0xCB }, /* cyrillic small letter ka */
+ { 0x43B, 0xCC }, /* cyrillic small letter el */
+ { 0x43C, 0xCD }, /* cyrillic small letter em */
+ { 0x43D, 0xCE }, /* cyrillic small letter en */
+ { 0x43E, 0xCF }, /* cyrillic small letter o */
+ { 0x43F, 0xD0 }, /* cyrillic small letter pe */
+ { 0x440, 0xD2 }, /* cyrillic small letter er */
+ { 0x441, 0xD3 }, /* cyrillic small letter es */
+ { 0x442, 0xD4 }, /* cyrillic small letter te */
+ { 0x443, 0xD5 }, /* cyrillic small letter u */
+ { 0x444, 0xC6 }, /* cyrillic small letter ef */
+ { 0x445, 0xC8 }, /* cyrillic small letter ha */
+ { 0x446, 0xC3 }, /* cyrillic small letter tse */
+ { 0x447, 0xDE }, /* cyrillic small letter che */
+ { 0x448, 0xDB }, /* cyrillic small letter sha */
+ { 0x449, 0xDD }, /* cyrillic small letter shcha */
+ { 0x44A, 0xDF }, /* cyrillic small letter hard sign */
+ { 0x44B, 0xD9 }, /* cyrillic small letter yeru */
+ { 0x44C, 0xD8 }, /* cyrillic small letter soft sign */
+ { 0x44D, 0xDC }, /* cyrillic small letter e */
+ { 0x44E, 0xC0 }, /* cyrillic small letter yu */
+ { 0x44F, 0xD1 }, /* cyrillic small letter ya */
+ { 0x451, 0xA3 }, /* cyrillic small letter io */
+ { 0x2219, 0x95 }, /* bullet operator */
+ { 0x221A, 0x96 }, /* square root */
+ { 0x2248, 0x97 }, /* almost equal to */
+ { 0x2264, 0x98 }, /* less-than or equal to */
+ { 0x2265, 0x99 }, /* greater-than or equal to */
+ { 0x2320, 0x93 }, /* top half integral */
+ { 0x2321, 0x9B }, /* bottom half integral */
+ { 0x2500, 0x80 }, /* box drawings light horizontal */
+ { 0x2502, 0x81 }, /* box drawings light vertical */
+ { 0x250C, 0x82 }, /* box drawings light down and right */
+ { 0x2510, 0x83 }, /* box drawings light down and left */
+ { 0x2514, 0x84 }, /* box drawings light up and right */
+ { 0x2518, 0x85 }, /* box drawings light up and left */
+ { 0x251C, 0x86 }, /* box drawings light vertical and right */
+ { 0x2524, 0x87 }, /* box drawings light vertical and left */
+ { 0x252C, 0x88 }, /* box drawings light down and horizontal */
+ { 0x2534, 0x89 }, /* box drawings light up and horizontal */
+ { 0x253C, 0x8A }, /* box drawings light vertical and horizontal */
+ { 0x2550, 0xA0 }, /* box drawings double horizontal */
+ { 0x2551, 0xA1 }, /* box drawings double vertical */
+ { 0x2552, 0xA2 }, /* box drawings down single and right double */
+ { 0x2553, 0xA4 }, /* box drawings down double and right single */
+ { 0x2554, 0xA5 }, /* box drawings double down and right */
+ { 0x2555, 0xA6 }, /* box drawings down single and left double */
+ { 0x2556, 0xA7 }, /* box drawings down double and left single */
+ { 0x2557, 0xA8 }, /* box drawings double down and left */
+ { 0x2558, 0xA9 }, /* box drawings up single and right double */
+ { 0x2559, 0xAA }, /* box drawings up double and right single */
+ { 0x255A, 0xAB }, /* box drawings double up and right */
+ { 0x255B, 0xAC }, /* box drawings up single and left double */
+ { 0x255C, 0xAD }, /* box drawings up double and left single */
+ { 0x255D, 0xAE }, /* box drawings double up and left */
+ { 0x255E, 0xAF }, /* box drawings vertical single and right double */
+ { 0x255F, 0xB0 }, /* box drawings vertical double and right single */
+ { 0x2560, 0xB1 }, /* box drawings double vertical and right */
+ { 0x2561, 0xB2 }, /* box drawings vertical single and left double */
+ { 0x2562, 0xB4 }, /* box drawings vertical double and left single */
+ { 0x2563, 0xB5 }, /* box drawings double vertical and left */
+ { 0x2564, 0xB6 }, /* box drawings down single and horizontal double */
+ { 0x2565, 0xB7 }, /* box drawings down double and horizontal single */
+ { 0x2566, 0xB8 }, /* box drawings double down and horizontal */
+ { 0x2567, 0xB9 }, /* box drawings up single and horizontal double */
+ { 0x2568, 0xBA }, /* box drawings up double and horizontal single */
+ { 0x2569, 0xBB }, /* box drawings double up and horizontal */
+ { 0x256A, 0xBC }, /* box drawings vertical single and horizontal double */
+ { 0x256B, 0xBD }, /* box drawings vertical double and horizontal single */
+ { 0x256C, 0xBE }, /* box drawings double vertical and horizontal */
+ { 0x2580, 0x8B }, /* upper half block */
+ { 0x2584, 0x8C }, /* lower half block */
+ { 0x2588, 0x8D }, /* full block */
+ { 0x258C, 0x8E }, /* left half block */
+ { 0x2590, 0x8F }, /* right half block */
+ { 0x2591, 0x90 }, /* light shade */
+ { 0x2592, 0x91 }, /* medium shade */
+ { 0x2593, 0x92 }, /* dark shade */
+ { 0x25A0, 0x94 }, /* black square */
+};
+
+/* CP866: { Unicode code point, charset code } pairs, listed in ascending
+ * code point order.
+ * The Cyrillic capitals 0x410-0x42F occupy the contiguous codes 0x80-0x9F,
+ * so yu (0x42E) is 0x9E and ya (0x42F) is 0x9F; both need their own entry. */
+static const unicode_mapping unimap_cp866[] = {
+ { 0xA0, 0xFF }, /* no-break space */
+ { 0xA4, 0xFD }, /* currency sign */
+ { 0xB0, 0xF8 }, /* degree sign */
+ { 0xB7, 0xFA }, /* middle dot */
+ { 0x401, 0xF0 }, /* cyrillic capital letter io */
+ { 0x404, 0xF2 }, /* cyrillic capital letter ukrainian ie */
+ { 0x407, 0xF4 }, /* cyrillic capital letter yi */
+ { 0x40E, 0xF6 }, /* cyrillic capital letter short u */
+ { 0x410, 0x80 }, /* cyrillic capital letter a */
+ { 0x411, 0x81 }, /* cyrillic capital letter be */
+ { 0x412, 0x82 }, /* cyrillic capital letter ve */
+ { 0x413, 0x83 }, /* cyrillic capital letter ghe */
+ { 0x414, 0x84 }, /* cyrillic capital letter de */
+ { 0x415, 0x85 }, /* cyrillic capital letter ie */
+ { 0x416, 0x86 }, /* cyrillic capital letter zhe */
+ { 0x417, 0x87 }, /* cyrillic capital letter ze */
+ { 0x418, 0x88 }, /* cyrillic capital letter i */
+ { 0x419, 0x89 }, /* cyrillic capital letter short i */
+ { 0x41A, 0x8A }, /* cyrillic capital letter ka */
+ { 0x41B, 0x8B }, /* cyrillic capital letter el */
+ { 0x41C, 0x8C }, /* cyrillic capital letter em */
+ { 0x41D, 0x8D }, /* cyrillic capital letter en */
+ { 0x41E, 0x8E }, /* cyrillic capital letter o */
+ { 0x41F, 0x8F }, /* cyrillic capital letter pe */
+ { 0x420, 0x90 }, /* cyrillic capital letter er */
+ { 0x421, 0x91 }, /* cyrillic capital letter es */
+ { 0x422, 0x92 }, /* cyrillic capital letter te */
+ { 0x423, 0x93 }, /* cyrillic capital letter u */
+ { 0x424, 0x94 }, /* cyrillic capital letter ef */
+ { 0x425, 0x95 }, /* cyrillic capital letter ha */
+ { 0x426, 0x96 }, /* cyrillic capital letter tse */
+ { 0x427, 0x97 }, /* cyrillic capital letter che */
+ { 0x428, 0x98 }, /* cyrillic capital letter sha */
+ { 0x429, 0x99 }, /* cyrillic capital letter shcha */
+ { 0x42A, 0x9A }, /* cyrillic capital letter hard sign */
+ { 0x42B, 0x9B }, /* cyrillic capital letter yeru */
+ { 0x42C, 0x9C }, /* cyrillic capital letter soft sign */
+ { 0x42D, 0x9D }, /* cyrillic capital letter e */
+ { 0x42E, 0x9E }, /* cyrillic capital letter yu */
+ { 0x42F, 0x9F }, /* cyrillic capital letter ya */
+ { 0x430, 0xA0 }, /* cyrillic small letter a */
+ { 0x431, 0xA1 }, /* cyrillic small letter be */
+ { 0x432, 0xA2 }, /* cyrillic small letter ve */
+ { 0x433, 0xA3 }, /* cyrillic small letter ghe */
+ { 0x434, 0xA4 }, /* cyrillic small letter de */
+ { 0x435, 0xA5 }, /* cyrillic small letter ie */
+ { 0x436, 0xA6 }, /* cyrillic small letter zhe */
+ { 0x437, 0xA7 }, /* cyrillic small letter ze */
+ { 0x438, 0xA8 }, /* cyrillic small letter i */
+ { 0x439, 0xA9 }, /* cyrillic small letter short i */
+ { 0x43A, 0xAA }, /* cyrillic small letter ka */
+ { 0x43B, 0xAB }, /* cyrillic small letter el */
+ { 0x43C, 0xAC }, /* cyrillic small letter em */
+ { 0x43D, 0xAD }, /* cyrillic small letter en */
+ { 0x43E, 0xAE }, /* cyrillic small letter o */
+ { 0x43F, 0xAF }, /* cyrillic small letter pe */
+ { 0x440, 0xE0 }, /* cyrillic small letter er */
+ { 0x441, 0xE1 }, /* cyrillic small letter es */
+ { 0x442, 0xE2 }, /* cyrillic small letter te */
+ { 0x443, 0xE3 }, /* cyrillic small letter u */
+ { 0x444, 0xE4 }, /* cyrillic small letter ef */
+ { 0x445, 0xE5 }, /* cyrillic small letter ha */
+ { 0x446, 0xE6 }, /* cyrillic small letter tse */
+ { 0x447, 0xE7 }, /* cyrillic small letter che */
+ { 0x448, 0xE8 }, /* cyrillic small letter sha */
+ { 0x449, 0xE9 }, /* cyrillic small letter shcha */
+ { 0x44A, 0xEA }, /* cyrillic small letter hard sign */
+ { 0x44B, 0xEB }, /* cyrillic small letter yeru */
+ { 0x44C, 0xEC }, /* cyrillic small letter soft sign */
+ { 0x44D, 0xED }, /* cyrillic small letter e */
+ { 0x44E, 0xEE }, /* cyrillic small letter yu */
+ { 0x44F, 0xEF }, /* cyrillic small letter ya */
+ { 0x451, 0xF1 }, /* cyrillic small letter io */
+ { 0x454, 0xF3 }, /* cyrillic small letter ukrainian ie */
+ { 0x457, 0xF5 }, /* cyrillic small letter yi */
+ { 0x45E, 0xF7 }, /* cyrillic small letter short u */
+ { 0x2116, 0xFC }, /* numero sign */
+ { 0x2219, 0xF9 }, /* bullet operator */
+ { 0x221A, 0xFB }, /* square root */
+ { 0x2500, 0xC4 }, /* box drawings light horizontal */
+ { 0x2502, 0xB3 }, /* box drawings light vertical */
+ { 0x250C, 0xDA }, /* box drawings light down and right */
+ { 0x2510, 0xBF }, /* box drawings light down and left */
+ { 0x2514, 0xC0 }, /* box drawings light up and right */
+ { 0x2518, 0xD9 }, /* box drawings light up and left */
+ { 0x251C, 0xC3 }, /* box drawings light vertical and right */
+ { 0x2524, 0xB4 }, /* box drawings light vertical and left */
+ { 0x252C, 0xC2 }, /* box drawings light down and horizontal */
+ { 0x2534, 0xC1 }, /* box drawings light up and horizontal */
+ { 0x253C, 0xC5 }, /* box drawings light vertical and horizontal */
+ { 0x2550, 0xCD }, /* box drawings double horizontal */
+ { 0x2551, 0xBA }, /* box drawings double vertical */
+ { 0x2552, 0xD5 }, /* box drawings down single and right double */
+ { 0x2553, 0xD6 }, /* box drawings down double and right single */
+ { 0x2554, 0xC9 }, /* box drawings double down and right */
+ { 0x2555, 0xB8 }, /* box drawings down single and left double */
+ { 0x2556, 0xB7 }, /* box drawings down double and left single */
+ { 0x2557, 0xBB }, /* box drawings double down and left */
+ { 0x2558, 0xD4 }, /* box drawings up single and right double */
+ { 0x2559, 0xD3 }, /* box drawings up double and right single */
+ { 0x255A, 0xC8 }, /* box drawings double up and right */
+ { 0x255B, 0xBE }, /* box drawings up single and left double */
+ { 0x255C, 0xBD }, /* box drawings up double and left single */
+ { 0x255D, 0xBC }, /* box drawings double up and left */
+ { 0x255E, 0xC6 }, /* box drawings vertical single and right double */
+ { 0x255F, 0xC7 }, /* box drawings vertical double and right single */
+ { 0x2560, 0xCC }, /* box drawings double vertical and right */
+ { 0x2561, 0xB5 }, /* box drawings vertical single and left double */
+ { 0x2562, 0xB6 }, /* box drawings vertical double and left single */
+ { 0x2563, 0xB9 }, /* box drawings double vertical and left */
+ { 0x2564, 0xD1 }, /* box drawings down single and horizontal double */
+ { 0x2565, 0xD2 }, /* box drawings down double and horizontal single */
+ { 0x2566, 0xCB }, /* box drawings double down and horizontal */
+ { 0x2567, 0xCF }, /* box drawings up single and horizontal double */
+ { 0x2568, 0xD0 }, /* box drawings up double and horizontal single */
+ { 0x2569, 0xCA }, /* box drawings double up and horizontal */
+ { 0x256A, 0xD8 }, /* box drawings vertical single and horizontal double */
+ { 0x256B, 0xD7 }, /* box drawings vertical double and horizontal single */
+ { 0x256C, 0xCE }, /* box drawings double vertical and horizontal */
+ { 0x2580, 0xDF }, /* upper half block */
+ { 0x2584, 0xDC }, /* lower half block */
+ { 0x2588, 0xDB }, /* full block */
+ { 0x258C, 0xDD }, /* left half block */
+ { 0x2590, 0xDE }, /* right half block */
+ { 0x2591, 0xB0 }, /* light shade */
+ { 0x2592, 0xB1 }, /* medium shade */
+ { 0x2593, 0xB2 }, /* dark shade */
+ { 0x25A0, 0xFE }, /* black square */
+};
+
+static const unicode_mapping unimap_macroman[] = { /* MacRoman: { Unicode code point, charset code } pairs, listed in ascending code point order */
+ { 0xA0, 0xCA }, /* no-break space */
+ { 0xA1, 0xC1 }, /* inverted exclamation mark */
+ { 0xA2, 0xA2 }, /* cent sign */
+ { 0xA3, 0xA3 }, /* pound sign */
+ { 0xA5, 0xB4 }, /* yen sign */
+ { 0xA7, 0xA4 }, /* section sign */
+ { 0xA8, 0xAC }, /* diaeresis */
+ { 0xA9, 0xA9 }, /* copyright sign */
+ { 0xAA, 0xBB }, /* feminine ordinal indicator */
+ { 0xAB, 0xC7 }, /* left-pointing double angle quotation mark */
+ { 0xAC, 0xC2 }, /* not sign */
+ { 0xAE, 0xA8 }, /* registered sign */
+ { 0xAF, 0xF8 }, /* macron */
+ { 0xB0, 0xA1 }, /* degree sign */
+ { 0xB1, 0xB1 }, /* plus-minus sign */
+ { 0xB4, 0xAB }, /* acute accent */
+ { 0xB5, 0xB5 }, /* micro sign */
+ { 0xB6, 0xA6 }, /* pilcrow sign */
+ { 0xB7, 0xE1 }, /* middle dot */
+ { 0xB8, 0xFC }, /* cedilla */
+ { 0xBA, 0xBC }, /* masculine ordinal indicator */
+ { 0xBB, 0xC8 }, /* right-pointing double angle quotation mark */
+ { 0xBF, 0xC0 }, /* inverted question mark */
+ { 0xC0, 0xCB }, /* latin capital letter a with grave */
+ { 0xC1, 0xE7 }, /* latin capital letter a with acute */
+ { 0xC2, 0xE5 }, /* latin capital letter a with circumflex */
+ { 0xC3, 0xCC }, /* latin capital letter a with tilde */
+ { 0xC4, 0x80 }, /* latin capital letter a with diaeresis */
+ { 0xC5, 0x81 }, /* latin capital letter a with ring above */
+ { 0xC6, 0xAE }, /* latin capital letter ae */
+ { 0xC7, 0x82 }, /* latin capital letter c with cedilla */
+ { 0xC8, 0xE9 }, /* latin capital letter e with grave */
+ { 0xC9, 0x83 }, /* latin capital letter e with acute */
+ { 0xCA, 0xE6 }, /* latin capital letter e with circumflex */
+ { 0xCB, 0xE8 }, /* latin capital letter e with diaeresis */
+ { 0xCC, 0xED }, /* latin capital letter i with grave */
+ { 0xCD, 0xEA }, /* latin capital letter i with acute */
+ { 0xCE, 0xEB }, /* latin capital letter i with circumflex */
+ { 0xCF, 0xEC }, /* latin capital letter i with diaeresis */
+ { 0xD1, 0x84 }, /* latin capital letter n with tilde */
+ { 0xD2, 0xF1 }, /* latin capital letter o with grave */
+ { 0xD3, 0xEE }, /* latin capital letter o with acute */
+ { 0xD4, 0xEF }, /* latin capital letter o with circumflex */
+ { 0xD5, 0xCD }, /* latin capital letter o with tilde */
+ { 0xD6, 0x85 }, /* latin capital letter o with diaeresis */
+ { 0xD8, 0xAF }, /* latin capital letter o with stroke */
+ { 0xD9, 0xF4 }, /* latin capital letter u with grave */
+ { 0xDA, 0xF2 }, /* latin capital letter u with acute */
+ { 0xDB, 0xF3 }, /* latin capital letter u with circumflex */
+ { 0xDC, 0x86 }, /* latin capital letter u with diaeresis */
+ { 0xDF, 0xA7 }, /* latin small letter sharp s */
+ { 0xE0, 0x88 }, /* latin small letter a with grave */
+ { 0xE1, 0x87 }, /* latin small letter a with acute */
+ { 0xE2, 0x89 }, /* latin small letter a with circumflex */
+ { 0xE3, 0x8B }, /* latin small letter a with tilde */
+ { 0xE4, 0x8A }, /* latin small letter a with diaeresis */
+ { 0xE5, 0x8C }, /* latin small letter a with ring above */
+ { 0xE6, 0xBE }, /* latin small letter ae */
+ { 0xE7, 0x8D }, /* latin small letter c with cedilla */
+ { 0xE8, 0x8F }, /* latin small letter e with grave */
+ { 0xE9, 0x8E }, /* latin small letter e with acute */
+ { 0xEA, 0x90 }, /* latin small letter e with circumflex */
+ { 0xEB, 0x91 }, /* latin small letter e with diaeresis */
+ { 0xEC, 0x93 }, /* latin small letter i with grave */
+ { 0xED, 0x92 }, /* latin small letter i with acute */
+ { 0xEE, 0x94 }, /* latin small letter i with circumflex */
+ { 0xEF, 0x95 }, /* latin small letter i with diaeresis */
+ { 0xF1, 0x96 }, /* latin small letter n with tilde */
+ { 0xF2, 0x98 }, /* latin small letter o with grave */
+ { 0xF3, 0x97 }, /* latin small letter o with acute */
+ { 0xF4, 0x99 }, /* latin small letter o with circumflex */
+ { 0xF5, 0x9B }, /* latin small letter o with tilde */
+ { 0xF6, 0x9A }, /* latin small letter o with diaeresis */
+ { 0xF7, 0xD6 }, /* division sign */
+ { 0xF8, 0xBF }, /* latin small letter o with stroke */
+ { 0xF9, 0x9D }, /* latin small letter u with grave */
+ { 0xFA, 0x9C }, /* latin small letter u with acute */
+ { 0xFB, 0x9E }, /* latin small letter u with circumflex */
+ { 0xFC, 0x9F }, /* latin small letter u with diaeresis */
+ { 0xFF, 0xD8 }, /* latin small letter y with diaeresis */
+ { 0x131, 0xF5 }, /* latin small letter dotless i */
+ { 0x152, 0xCE }, /* latin capital ligature oe */
+ { 0x153, 0xCF }, /* latin small ligature oe */
+ { 0x178, 0xD9 }, /* latin capital letter y with diaeresis */
+ { 0x192, 0xC4 }, /* latin small letter f with hook */
+ { 0x2C6, 0xF6 }, /* modifier letter circumflex accent */
+ { 0x2C7, 0xFF }, /* caron */
+ { 0x2D8, 0xF9 }, /* breve */
+ { 0x2D9, 0xFA }, /* dot above */
+ { 0x2DA, 0xFB }, /* ring above */
+ { 0x2DB, 0xFE }, /* ogonek */
+ { 0x2DC, 0xF7 }, /* small tilde */
+ { 0x2DD, 0xFD }, /* double acute accent */
+ { 0x3A9, 0xBD }, /* greek capital letter omega */
+ { 0x3C0, 0xB9 }, /* greek small letter pi */
+ { 0x2013, 0xD0 }, /* en dash */
+ { 0x2014, 0xD1 }, /* em dash */
+ { 0x2018, 0xD4 }, /* left single quotation mark */
+ { 0x2019, 0xD5 }, /* right single quotation mark */
+ { 0x201A, 0xE2 }, /* single low-9 quotation mark */
+ { 0x201C, 0xD2 }, /* left double quotation mark */
+ { 0x201D, 0xD3 }, /* right double quotation mark */
+ { 0x201E, 0xE3 }, /* double low-9 quotation mark */
+ { 0x2020, 0xA0 }, /* dagger */
+ { 0x2021, 0xE0 }, /* double dagger */
+ { 0x2022, 0xA5 }, /* bullet */
+ { 0x2026, 0xC9 }, /* horizontal ellipsis */
+ { 0x2030, 0xE4 }, /* per mille sign */
+ { 0x2039, 0xDC }, /* single left-pointing angle quotation mark */
+ { 0x203A, 0xDD }, /* single right-pointing angle quotation mark */
+ { 0x2044, 0xDA }, /* fraction slash */
+ { 0x20AC, 0xDB }, /* euro sign */
+ { 0x2122, 0xAA }, /* trade mark sign */
+ { 0x2202, 0xB6 }, /* partial differential */
+ { 0x2206, 0xC6 }, /* increment */
+ { 0x220F, 0xB8 }, /* n-ary product */
+ { 0x2211, 0xB7 }, /* n-ary summation */
+ { 0x221A, 0xC3 }, /* square root */
+ { 0x221E, 0xB0 }, /* infinity */
+ { 0x222B, 0xBA }, /* integral */
+ { 0x2248, 0xC5 }, /* almost equal to */
+ { 0x2260, 0xAD }, /* not equal to */
+ { 0x2264, 0xB2 }, /* less-than or equal to */
+ { 0x2265, 0xB3 }, /* greater-than or equal to */
+ { 0x25CA, 0xD7 }, /* lozenge */
+ { 0xF8FF, 0xF0 }, /* apple logo */
+ { 0xFB01, 0xDE }, /* latin small ligature fi */
+ { 0xFB02, 0xDF }, /* latin small ligature fl */
+};
+
+#endif /* HTML_TABLES_H */
+/*
+ +----------------------------------------------------------------------+
+ | PHP Version 5 |
+ +----------------------------------------------------------------------+
+ | Copyright (c) 1997-2010 The PHP Group |
+ +----------------------------------------------------------------------+
+ | This source file is subject to version 3.01 of the PHP license, |
+ | that is bundled with this package in the file LICENSE, and is |
+ | available through the world-wide-web at the following url: |
+ | http://www.php.net/license/3_01.txt |
+ | If you did not receive a copy of the PHP license and are unable to |
+ | obtain it through the world-wide-web, please send a note to |
+ | license@php.net so we can mail you a copy immediately. |
+ +----------------------------------------------------------------------+
+ | Author: Rasmus Lerdorf <rasmus@lerdorf.on.ca> |
+ +----------------------------------------------------------------------+
+*/
+
+/* $Id: html.h 293036 2010-01-03 09:23:27Z sebastian $ */
+
+#ifndef HTML_TABLES_H
+#define HTML_TABLES_H
+
+/* cs_terminator is overloaded in the following fashion:
+ * - It terminates the list of entity maps.
+ * - In BG(inverse_ent_maps), it's the key of the inverse map that stores
+ * only the basic entities.
+ * - When passed to traverse_for_entities (or via php_unescape_entities with !all),
+ * we don't care about the encoding (UTF-8 is chosen, but cs_terminator should
+ * be used when the encoding doesn't matter).
+ */
+enum entity_charset {
+ cs_terminator, /* first enumerator, so its value is 0 */
+ cs_8859_1,
+ cs_cp1252,
+ cs_8859_15,
+ cs_utf_8,
+ cs_big5,
+ cs_gb2312,
+ cs_big5hkscs,
+ cs_sjis,
+ cs_eucjp,
+ cs_koi8r,
+ cs_cp1251,
+ cs_8859_5,
+ cs_cp866,
+ cs_macroman,
+ cs_numelems /* last enumerator: used to count the number of charsets */
+};
+typedef const char *const entity_table_t; /* element type of the ent_* tables: a const pointer to a const entity-name string; the tables use NULL for slots without a name */
+
+/* Entity names for codepage 1252, a Windows extension to iso-8859-1.
+ * 32 consecutive slots; NULL marks a slot without a named entity.
+ * Row labels give the cp1252 byte of the first slot in the row, presumably
+ * starting at 0x80 -- confirm against the map that references this table. */
+static entity_table_t ent_cp_1252[] = {
+ /* 0x80 */ "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger", "Dagger",
+ /* 0x88 */ "circ", "permil", "Scaron", "lsaquo", "OElig", NULL, NULL, NULL,
+ /* 0x90 */ NULL, "lsquo", "rsquo", "ldquo", "rdquo", "bull", "ndash", "mdash",
+ /* 0x98 */ "tilde", "trade", "scaron", "rsaquo", "oelig", NULL, NULL, "Yuml"
+};
+
+/* Entity names for ISO-8859-1: 96 consecutive slots, eight per row.
+ * Row labels give the byte of the first slot in the row, presumably
+ * starting at 0xA0 (nbsp) -- confirm against the map that references
+ * this table. */
+static entity_table_t ent_iso_8859_1[] = {
+ /* 0xA0 */ "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
+ /* 0xA8 */ "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
+ /* 0xB0 */ "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
+ /* 0xB8 */ "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
+ /* 0xC0 */ "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
+ /* 0xC8 */ "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
+ /* 0xD0 */ "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
+ /* 0xD8 */ "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
+ /* 0xE0 */ "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
+ /* 0xE8 */ "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
+ /* 0xF0 */ "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
+ /* 0xF8 */ "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
+};
+
+/* Entity names for ISO-8859-15: 96 consecutive slots, eight per row.
+ * The two NULL slots are Zcaron and zcaron, which have no entity name here.
+ * Row labels give the byte of the first slot in the row, presumably
+ * starting at 0xA0 (nbsp) -- confirm against the map that references
+ * this table. */
+static entity_table_t ent_iso_8859_15[] = {
+ /* 0xA0 */ "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron", "sect",
+ /* 0xA8 */ "scaron", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
+ /* 0xB0 */ "deg", "plusmn", "sup2", "sup3", NULL /* Zcaron */, "micro", "para", "middot",
+ /* 0xB8 */ NULL /* zcaron */, "sup1", "ordm", "raquo", "OElig", "oelig", "Yuml", "iquest",
+ /* 0xC0 */ "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
+ /* 0xC8 */ "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
+ /* 0xD0 */ "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
+ /* 0xD8 */ "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
+ /* 0xE0 */ "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
+ /* 0xE8 */ "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
+ /* 0xF0 */ "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
+ /* 0xF8 */ "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
+};
+
+/* Entity names for code points 338 (0x0152) through 402 (0x0192): 65 slots,
+ * NULL where there is no name. Each row is labelled with the code point of
+ * its first slot. */
+static entity_table_t ent_uni_338_402[] = {
+ /* 0x0152 */ "OElig", "oelig", NULL, NULL, NULL, NULL,
+ /* 0x0158 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 0x0160 */ "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 0x0168 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 0x0170 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 0x0178 */ "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 0x0180 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 0x0188 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 0x0190 */ NULL, NULL, "fnof"
+};
+
+/* Entity names for code points 710 through 732: 23 slots, of which only the
+ * first ("circ") and the last ("tilde") carry a name. */
+static entity_table_t ent_uni_spacing[] = {
+ "circ", /* 710 */
+ NULL, NULL, NULL, NULL, NULL, /* 711 - 715 */
+ NULL, NULL, NULL, NULL, NULL, /* 716 - 720 */
+ NULL, NULL, NULL, NULL, NULL, /* 721 - 725 */
+ NULL, NULL, NULL, NULL, NULL, /* 726 - 730 */
+ NULL, /* 731 */
+ "tilde" /* 732 */
+};
+
+/* Entity names for code points 913 through 982 (70 slots); NULL marks a
+ * code point that is not mapped. The trailing comment on each row gives
+ * the code point range that row covers. */
+static entity_table_t ent_uni_greek[] = {
+ "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", /* 913 - 918 */
+ "Eta", "Theta", "Iota", "Kappa", "Lambda", "Mu", /* 919 - 924 */
+ "Nu", "Xi", "Omicron", "Pi", "Rho", NULL, /* 925 - 930 */
+ "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", /* 931 - 936 */
+ "Omega", /* 937 */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 938 - 944 are not mapped */
+ "alpha", "beta", "gamma", "delta", "epsilon", "zeta", /* 945 - 950 */
+ "eta", "theta", "iota", "kappa", "lambda", "mu", /* 951 - 956 */
+ "nu", "xi", "omicron", "pi", "rho", "sigmaf", /* 957 - 962 */
+ "sigma", "tau", "upsilon", "phi", "chi", "psi", /* 963 - 968 */
+ "omega", /* 969 */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 970 - 976 are not mapped */
+ "thetasym", "upsih", /* 977 - 978 */
+ NULL, NULL, NULL, /* 979 - 981 */
+ "piv" /* 982 */
+};
+
+/* Entity names for code points 8194 (0x2002) through 8260 (0x2044): 67 slots,
+ * NULL where there is no name. Each row is labelled with the code point of
+ * its first slot. */
+static entity_table_t ent_uni_punct[] = {
+ /* 0x2002 */ "ensp", "emsp", NULL, NULL, NULL, NULL,
+ /* 0x2008 */ NULL, "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
+ /* 0x2010 */ NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
+ /* 0x2018 */ "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
+ /* 0x2020 */ "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip", NULL,
+ /* 0x2028 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 0x2030 */ "permil", NULL, "prime", "Prime", NULL, NULL, NULL, NULL,
+ /* 0x2038 */ NULL, "lsaquo", "rsaquo", NULL, NULL, NULL, "oline", NULL,
+ /* 0x2040 */ NULL, NULL, NULL, NULL, "frasl"
+};
+
+static entity_table_t ent_uni_euro[] = { /* single-slot table for code point 8364 (range per entity_map) */
+ "euro" /* 8364 (0x20AC) */
+};
+
+static entity_table_t ent_uni_8465_8501[] = { /* code points 8465..8501; slot i is code 8465 + i */
+ /* 8465 (0x2111) */
+ "image", NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8472 (0x2118) */
+ "weierp", NULL, NULL, NULL,
+ /* 8476 (0x211C) */
+ "real", NULL, NULL, NULL, NULL, NULL,
+ /* 8482 (0x2122), followed by 8483 - 8500 with no entry */
+ "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8501 (0x2135) */
+ "alefsym",
+};
+
+static entity_table_t ent_uni_8592_9002[] = { /* code points 8592..9002; slot i is code 8592 + i; comments mark 16-slot rows */
+ /* 8592 (0x2190) */
+ "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8608 (0x21a0) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8624 (0x21b0) */
+ NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8640 (0x21c0) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8656 (0x21d0) */
+ "lArr", "uArr", "rArr", "dArr", "hArr", NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8672 (0x21e0) - 8703 (0x21ff): two 16-slot rows with no entries */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8704 (0x2200) */
+ "forall", NULL, "part", "exist", NULL, "empty", NULL, "nabla",
+ "isin", "notin", NULL, "ni", NULL, NULL, NULL, "prod",
+ /* 8720 (0x2210) */
+ NULL, "sum", "minus", NULL, NULL, NULL, NULL, "lowast",
+ NULL, NULL, "radic", NULL, NULL, "prop", "infin", NULL,
+ /* 8736 (0x2220) */
+ "ang", NULL, NULL, NULL, NULL, NULL, NULL, "and",
+ "or", "cap", "cup", "int", NULL, NULL, NULL, NULL,
+ /* 8752 (0x2230) */
+ NULL, NULL, NULL, NULL, "there4", NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, "sim", NULL, NULL, NULL,
+ /* 8768 (0x2240) */
+ NULL, NULL, NULL, NULL, NULL, "cong", NULL, NULL,
+ "asymp", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8784 (0x2250) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8800 (0x2260) */
+ "ne", "equiv", NULL, NULL, "le", "ge", NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8816 (0x2270) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8832 (0x2280) */
+ NULL, NULL, "sub", "sup", "nsub", NULL, "sube", "supe",
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8848 (0x2290) */
+ NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8864 (0x22a0) */
+ NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8880 (0x22b0) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8896 (0x22c0) */
+ NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8912 (0x22d0) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8928 (0x22e0) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8944 (0x22f0) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8960 (0x2300) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
+ /* 8976 (0x2310) */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ /* 8992 (0x2320); the table stops at 9002 */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, "lang", "rang"
+};
+
+static entity_table_t ent_uni_9674[] = { /* single-slot table for code point 9674 (range per entity_map) */
+ /* 9674 (0x25CA) */
+ "loz"
+};
+
+static entity_table_t ent_uni_9824_9830[] = { /* code points 9824..9830; slot i is code 9824 + i */
+ /* 9824 (0x2660); 9825, 9826 and 9828 have no entry */
+ "spades", NULL, NULL, "clubs", NULL, "hearts", "diams"
+};
+
+static entity_table_t ent_koi8r[] = { /* KOI8-R bytes 0xa3..0xff (range per entity_map); slot = byte - 0xa3 */
+ "#1105", /* 0xa3: cyrillic small letter io ("jo") */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, "#1025", /* 0xb3: cyrillic capital letter io ("JO") */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0xb4 - 0xbf */
+ "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092", /* 0xc0 onwards: numeric references */
+ "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084",
+ "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090",
+ "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096",
+ "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041",
+ "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048",
+ "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055",
+ "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
+ "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063",
+ "#1066"
+};
+
+static entity_table_t ent_cp_1251[] = { /* Windows-1251 bytes 0x80..0xff (range per entity_map); slot = byte - 0x80 */
+ "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
+ "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
+ "#1035", "#1039", "#1106", "#8216", "#8217", "#8220", "#8221", /* 0x93/0x94 are U+201C/U+201D, as in unimap_win1251 */
+ "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250", /* 0x98 has no entry */
+ "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
+ "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
+ "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
+ "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
+ "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
+ "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046", /* 0xc0 onwards: U+0410..U+044F in order */
+ "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
+ "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
+ "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
+ "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
+ "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
+ "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
+ "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
+ "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
+ "#1103"
+};
+
+static entity_table_t ent_iso_8859_5[] = { /* ISO-8859-5 bytes 0xc0..0xff (range per entity_map); slot = byte - 0xc0 */
+ "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
+ "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
+ "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
+ "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
+ "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
+ "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
+ "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#8470", /* 0xf0 is the numero sign (U+2116), not U+0450 */
+ "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
+ "#1112", "#1113", "#1114", "#1115", "#1116", "sect", "#1118", /* 0xfd is the section sign (U+00A7), not U+045D */
+ "#1119"
+};
+
+static entity_table_t ent_cp_866[] = { /* cp866 bytes 0xc0..0xff (range per entity_map); slot = byte - 0xc0 */
+
+ "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566", /* 0xc0 onwards: numeric references only */
+ "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552",
+ "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560",
+ "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608",
+ "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090",
+ "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
+ "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025",
+ "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118",
+ "#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632",
+ "#160" /* 0xff */
+};
+
+/* MacRoman table for bytes 0x0b..0xff (range per entity_map). Besides the */
+/* high half it lists a few low-ASCII codes: vertical tab (ASCII 11), often */
+/* used for line breaks inside DB exports, gets "sp"; also quot/amp/lt/gt. */
+static entity_table_t ent_macroman[] = { /* slot = byte - 0x0b */
+ "sp", NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, "quot", NULL, /* "quot" is 0x22 */
+ NULL, NULL, "amp", NULL, NULL, NULL, NULL, /* "amp" is 0x26 */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, "lt", NULL, "gt", NULL, /* "lt" is 0x3c, "gt" is 0x3e */
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml", /* NULL is 0x7f, "Auml" is 0x80 */
+ "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
+ "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
+ "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
+ "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg",
+ "cent", "pound", "sect", "bull", "para", "szlig", "reg",
+ "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
+ "infin", "plusmn", "le", "ge", "yen", "micro", "part",
+ "sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
+ "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof",
+ "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
+ "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
+ "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
+ "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger",
+ "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
+ "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
+ "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
+ "circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
+ "#733", "#731", "#711" /* 0xfd - 0xff */
+};
+
+struct html_entity_map { /* one contiguous run of char codes in a charset, plus its entity table */
+ enum entity_charset charset; /* charset identifier */
+ unsigned int basechar; /* char code at start of table (table slot 0) */
+ unsigned int endchar; /* last char code in the table (inclusive) */
+ entity_table_t *table; /* the table of mappings, one slot per char code */
+};
+
+static const struct html_entity_map entity_map[] = { /* rows: charset, first code, last code, table; a charset may have several rows */
+ { cs_cp1252, 0x80, 0x9f, ent_cp_1252 },
+ { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 },
+ { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 },
+ { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 },
+ { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 },
+ { cs_utf_8, 338, 402, ent_uni_338_402 },
+ { cs_utf_8, 710, 732, ent_uni_spacing },
+ { cs_utf_8, 913, 982, ent_uni_greek },
+ { cs_utf_8, 8194, 8260, ent_uni_punct },
+ { cs_utf_8, 8364, 8364, ent_uni_euro },
+ { cs_utf_8, 8465, 8501, ent_uni_8465_8501 },
+ { cs_utf_8, 8592, 9002, ent_uni_8592_9002 },
+ { cs_utf_8, 9674, 9674, ent_uni_9674 },
+ { cs_utf_8, 9824, 9830, ent_uni_9824_9830 },
+ { cs_big5, 0xa0, 0xff, ent_iso_8859_1 },
+ { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 },
+ { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
+ { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 },
+ { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 },
+ { cs_koi8r, 0xa3, 0xff, ent_koi8r },
+ { cs_cp1251, 0x80, 0xff, ent_cp_1251 },
+ { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 },
+ { cs_cp866, 0xc0, 0xff, ent_cp_866 },
+ { cs_macroman, 0x0b, 0xff, ent_macroman },
+ { cs_terminator } /* end-of-list marker; remaining fields are zero-initialized */
+};
+
+static const struct { /* charset name string -> enum entity_charset; several spellings may share one value */
+ const char *codeset; /* charset name; NULL in the final row */
+ enum entity_charset charset; /* charset identifier for that name */
+} charset_map[] = {
+ { "ISO-8859-1", cs_8859_1 },
+ { "ISO8859-1", cs_8859_1 },
+ { "ISO-8859-15", cs_8859_15 },
+ { "ISO8859-15", cs_8859_15 },
+ { "utf-8", cs_utf_8 },
+ { "cp1252", cs_cp1252 },
+ { "Windows-1252", cs_cp1252 },
+ { "1252", cs_cp1252 },
+ { "BIG5", cs_big5 },
+ { "950", cs_big5 },
+ { "GB2312", cs_gb2312 },
+ { "936", cs_gb2312 },
+ { "BIG5-HKSCS", cs_big5hkscs },
+ { "Shift_JIS", cs_sjis },
+ { "SJIS", cs_sjis },
+ { "932", cs_sjis },
+ { "EUCJP", cs_eucjp },
+ { "EUC-JP", cs_eucjp },
+ { "KOI8-R", cs_koi8r },
+ { "koi8-ru", cs_koi8r },
+ { "koi8r", cs_koi8r },
+ { "cp1251", cs_cp1251 },
+ { "Windows-1251", cs_cp1251 },
+ { "win-1251", cs_cp1251 },
+ { "iso8859-5", cs_8859_5 },
+ { "iso-8859-5", cs_8859_5 },
+ { "cp866", cs_cp866 },
+ { "866", cs_cp866 },
+ { "ibm866", cs_cp866 },
+ { "MacRoman", cs_macroman },
+ { NULL } /* end-of-list marker */
+};
+
+typedef struct { /* one "basic" character and the entity text used for it */
+ unsigned short charcode; /* the character, e.g. '&' */
+ char *entity; /* full entity text including '&' and ';', e.g. "&amp;" */
+ int entitylen; /* length of entity in bytes, e.g. 5 for "&amp;" */
+ int flags; /* 0 or an ENT_HTML_QUOTE_* value; presumably the quote style that enables the row - confirm against callers */
+} basic_entity_t;
+
+static const basic_entity_t basic_entities_ex[] = { /* rows: character, entity text, entity length, flags */
+ { '&', "&amp;", 5, 0 }, /* must stay first: basic_entities points at the row after it */
+ { '"', "&quot;", 6, ENT_HTML_QUOTE_DOUBLE },
+ /* PHP traditionally encodes ' as &#039;, not &apos;, so this row comes before the &apos; row */
+ { '\'', "&#039;", 6, ENT_HTML_QUOTE_SINGLE },
+ { '\'', "&apos;", 6, ENT_HTML_QUOTE_SINGLE },
+ { '<', "&lt;", 4, 0 },
+ { '>', "&gt;", 4, 0 },
+ { 0, NULL, 0, 0 } /* end-of-list marker */
+};
+
+/* In some cases '&' needs special treatment; this view of basic_entities_ex
+ * starts at its second row and so leaves the leading '&' entry out */
+static const basic_entity_t *basic_entities = &basic_entities_ex[1];
+
+typedef struct { /* one pair: Unicode code point and its byte in a single-byte charset */
+ unsigned short un_code_point; /* Unicode code point; we don't need bigger than 16 bits */
+ unsigned char cs_code; /* byte in the target charset; currently, we only have maps to single-byte encodings */
+} unicode_mapping;
+
+static const unicode_mapping unimap_iso885915[] = { /* Unicode code point -> ISO-8859-15 byte; rows ascend by code point */
+ { 0xA5, 0xA5 }, /* yen sign */
+ { 0xA7, 0xA7 }, /* section sign */
+ { 0xA9, 0xA9 }, /* copyright sign */
+ { 0xAA, 0xAA }, /* feminine ordinal indicator */
+ { 0xAB, 0xAB }, /* left-pointing double angle quotation mark */
+ { 0xAC, 0xAC }, /* not sign */
+ { 0xAD, 0xAD }, /* soft hyphen */
+ { 0xAE, 0xAE }, /* registered sign */
+ { 0xAF, 0xAF }, /* macron */
+ { 0xB0, 0xB0 }, /* degree sign */
+ { 0xB1, 0xB1 }, /* plus-minus sign */
+ { 0xB2, 0xB2 }, /* superscript two */
+ { 0xB3, 0xB3 }, /* superscript three */
+ { 0xB5, 0xB5 }, /* micro sign */
+ { 0xB6, 0xB6 }, /* pilcrow sign */
+ { 0xB7, 0xB7 }, /* middle dot */
+ { 0xB9, 0xB9 }, /* superscript one */
+ { 0xBA, 0xBA }, /* masculine ordinal indicator */
+ { 0xBB, 0xBB }, /* right-pointing double angle quotation mark */
+ { 0x152, 0xBC }, /* latin capital ligature oe */
+ { 0x153, 0xBD }, /* latin small ligature oe */
+ { 0x160, 0xA6 }, /* latin capital letter s with caron */
+ { 0x161, 0xA8 }, /* latin small letter s with caron */
+ { 0x178, 0xBE }, /* latin capital letter y with diaeresis */
+ { 0x17D, 0xB4 }, /* latin capital letter z with caron */
+ { 0x17E, 0xB8 }, /* latin small letter z with caron */
+ { 0x20AC, 0xA4 }, /* euro sign */
+};
+
+static const unicode_mapping unimap_win1252[] = { /* Unicode code point -> Windows-1252 byte (0x80..0x9F here); rows ascend by code point */
+ { 0x152, 0x8C }, /* latin capital ligature oe */
+ { 0x153, 0x9C }, /* latin small ligature oe */
+ { 0x160, 0x8A }, /* latin capital letter s with caron */
+ { 0x161, 0x9A }, /* latin small letter s with caron */
+ { 0x178, 0x9F }, /* latin capital letter y with diaeresis */
+ { 0x17D, 0x8E }, /* latin capital letter z with caron */
+ { 0x17E, 0x9E }, /* latin small letter z with caron */
+ { 0x192, 0x83 }, /* latin small letter f with hook */
+ { 0x2C6, 0x88 }, /* modifier letter circumflex accent */
+ { 0x2DC, 0x98 }, /* small tilde */
+ { 0x2013, 0x96 }, /* en dash */
+ { 0x2014, 0x97 }, /* em dash */
+ { 0x2018, 0x91 }, /* left single quotation mark */
+ { 0x2019, 0x92 }, /* right single quotation mark */
+ { 0x201A, 0x82 }, /* single low-9 quotation mark */
+ { 0x201C, 0x93 }, /* left double quotation mark */
+ { 0x201D, 0x94 }, /* right double quotation mark */
+ { 0x201E, 0x84 }, /* double low-9 quotation mark */
+ { 0x2020, 0x86 }, /* dagger */
+ { 0x2021, 0x87 }, /* double dagger */
+ { 0x2022, 0x95 }, /* bullet */
+ { 0x2026, 0x85 }, /* horizontal ellipsis */
+ { 0x2030, 0x89 }, /* per mille sign */
+ { 0x2039, 0x8B }, /* single left-pointing angle quotation mark */
+ { 0x203A, 0x9B }, /* single right-pointing angle quotation mark */
+ { 0x20AC, 0x80 }, /* euro sign */
+ { 0x2122, 0x99 }, /* trade mark sign */
+};
+
+static const unicode_mapping unimap_win1251[] = { /* Unicode code point -> Windows-1251 byte; rows ascend by code point */
+ { 0xA0, 0xA0 }, /* no-break space */
+ { 0xA4, 0xA4 }, /* currency sign */
+ { 0xA6, 0xA6 }, /* broken bar */
+ { 0xA7, 0xA7 }, /* section sign */
+ { 0xA9, 0xA9 }, /* copyright sign */
+ { 0xAB, 0xAB }, /* left-pointing double angle quotation mark */
+ { 0xAC, 0xAC }, /* not sign */
+ { 0xAD, 0xAD }, /* soft hyphen */
+ { 0xAE, 0xAE }, /* registered sign */
+ { 0xB0, 0xB0 }, /* degree sign */
+ { 0xB1, 0xB1 }, /* plus-minus sign */
+ { 0xB5, 0xB5 }, /* micro sign */
+ { 0xB6, 0xB6 }, /* pilcrow sign */
+ { 0xB7, 0xB7 }, /* middle dot */
+ { 0xBB, 0xBB }, /* right-pointing double angle quotation mark */
+ { 0x401, 0xA8 }, /* cyrillic capital letter io */
+ { 0x402, 0x80 }, /* cyrillic capital letter dje */
+ { 0x403, 0x81 }, /* cyrillic capital letter gje */
+ { 0x404, 0xAA }, /* cyrillic capital letter ukrainian ie */
+ { 0x405, 0xBD }, /* cyrillic capital letter dze */
+ { 0x406, 0xB2 }, /* cyrillic capital letter byelorussian-ukrainian i */
+ { 0x407, 0xAF }, /* cyrillic capital letter yi */
+ { 0x408, 0xA3 }, /* cyrillic capital letter je */
+ { 0x409, 0x8A }, /* cyrillic capital letter lje */
+ { 0x40A, 0x8C }, /* cyrillic capital letter nje */
+ { 0x40B, 0x8E }, /* cyrillic capital letter tshe */
+ { 0x40C, 0x8D }, /* cyrillic capital letter kje */
+ { 0x40E, 0xA1 }, /* cyrillic capital letter short u */
+ { 0x40F, 0x8F }, /* cyrillic capital letter dzhe */
+ { 0x410, 0xC0 }, /* cyrillic capital letter a */
+ { 0x411, 0xC1 }, /* cyrillic capital letter be */
+ { 0x412, 0xC2 }, /* cyrillic capital letter ve */
+ { 0x413, 0xC3 }, /* cyrillic capital letter ghe */
+ { 0x414, 0xC4 }, /* cyrillic capital letter de */
+ { 0x415, 0xC5 }, /* cyrillic capital letter ie */
+ { 0x416, 0xC6 }, /* cyrillic capital letter zhe */
+ { 0x417, 0xC7 }, /* cyrillic capital letter ze */
+ { 0x418, 0xC8 }, /* cyrillic capital letter i */
+ { 0x419, 0xC9 }, /* cyrillic capital letter short i */
+ { 0x41A, 0xCA }, /* cyrillic capital letter ka */
+ { 0x41B, 0xCB }, /* cyrillic capital letter el */
+ { 0x41C, 0xCC }, /* cyrillic capital letter em */
+ { 0x41D, 0xCD }, /* cyrillic capital letter en */
+ { 0x41E, 0xCE }, /* cyrillic capital letter o */
+ { 0x41F, 0xCF }, /* cyrillic capital letter pe */
+ { 0x420, 0xD0 }, /* cyrillic capital letter er */
+ { 0x421, 0xD1 }, /* cyrillic capital letter es */
+ { 0x422, 0xD2 }, /* cyrillic capital letter te */
+ { 0x423, 0xD3 }, /* cyrillic capital letter u */
+ { 0x424, 0xD4 }, /* cyrillic capital letter ef */
+ { 0x425, 0xD5 }, /* cyrillic capital letter ha */
+ { 0x426, 0xD6 }, /* cyrillic capital letter tse */
+ { 0x427, 0xD7 }, /* cyrillic capital letter che */
+ { 0x428, 0xD8 }, /* cyrillic capital letter sha */
+ { 0x429, 0xD9 }, /* cyrillic capital letter shcha */
+ { 0x42A, 0xDA }, /* cyrillic capital letter hard sign */
+ { 0x42B, 0xDB }, /* cyrillic capital letter yeru */
+ { 0x42C, 0xDC }, /* cyrillic capital letter soft sign */
+ { 0x42D, 0xDD }, /* cyrillic capital letter e */
+ { 0x42E, 0xDE }, /* cyrillic capital letter yu */
+ { 0x42F, 0xDF }, /* cyrillic capital letter ya */
+ { 0x430, 0xE0 }, /* cyrillic small letter a */
+ { 0x431, 0xE1 }, /* cyrillic small letter be */
+ { 0x432, 0xE2 }, /* cyrillic small letter ve */
+ { 0x433, 0xE3 }, /* cyrillic small letter ghe */
+ { 0x434, 0xE4 }, /* cyrillic small letter de */
+ { 0x435, 0xE5 }, /* cyrillic small letter ie */
+ { 0x436, 0xE6 }, /* cyrillic small letter zhe */
+ { 0x437, 0xE7 }, /* cyrillic small letter ze */
+ { 0x438, 0xE8 }, /* cyrillic small letter i */
+ { 0x439, 0xE9 }, /* cyrillic small letter short i */
+ { 0x43A, 0xEA }, /* cyrillic small letter ka */
+ { 0x43B, 0xEB }, /* cyrillic small letter el */
+ { 0x43C, 0xEC }, /* cyrillic small letter em */
+ { 0x43D, 0xED }, /* cyrillic small letter en */
+ { 0x43E, 0xEE }, /* cyrillic small letter o */
+ { 0x43F, 0xEF }, /* cyrillic small letter pe */
+ { 0x440, 0xF0 }, /* cyrillic small letter er */
+ { 0x441, 0xF1 }, /* cyrillic small letter es */
+ { 0x442, 0xF2 }, /* cyrillic small letter te */
+ { 0x443, 0xF3 }, /* cyrillic small letter u */
+ { 0x444, 0xF4 }, /* cyrillic small letter ef */
+ { 0x445, 0xF5 }, /* cyrillic small letter ha */
+ { 0x446, 0xF6 }, /* cyrillic small letter tse */
+ { 0x447, 0xF7 }, /* cyrillic small letter che */
+ { 0x448, 0xF8 }, /* cyrillic small letter sha */
+ { 0x449, 0xF9 }, /* cyrillic small letter shcha */
+ { 0x44A, 0xFA }, /* cyrillic small letter hard sign */
+ { 0x44B, 0xFB }, /* cyrillic small letter yeru */
+ { 0x44C, 0xFC }, /* cyrillic small letter soft sign */
+ { 0x44D, 0xFD }, /* cyrillic small letter e */
+ { 0x44E, 0xFE }, /* cyrillic small letter yu */
+ { 0x44F, 0xFF }, /* cyrillic small letter ya */
+ { 0x451, 0xB8 }, /* cyrillic small letter io */
+ { 0x452, 0x90 }, /* cyrillic small letter dje */
+ { 0x453, 0x83 }, /* cyrillic small letter gje */
+ { 0x454, 0xBA }, /* cyrillic small letter ukrainian ie */
+ { 0x455, 0xBE }, /* cyrillic small letter dze */
+ { 0x456, 0xB3 }, /* cyrillic small letter byelorussian-ukrainian i */
+ { 0x457, 0xBF }, /* cyrillic small letter yi */
+ { 0x458, 0xBC }, /* cyrillic small letter je */
+ { 0x459, 0x9A }, /* cyrillic small letter lje */
+ { 0x45A, 0x9C }, /* cyrillic small letter nje */
+ { 0x45B, 0x9E }, /* cyrillic small letter tshe */
+ { 0x45C, 0x9D }, /* cyrillic small letter kje */
+ { 0x45E, 0xA2 }, /* cyrillic small letter short u */
+ { 0x45F, 0x9F }, /* cyrillic small letter dzhe */
+ { 0x490, 0xA5 }, /* cyrillic capital letter ghe with upturn */
+ { 0x491, 0xB4 }, /* cyrillic small letter ghe with upturn */
+ { 0x2013, 0x96 }, /* en dash */
+ { 0x2014, 0x97 }, /* em dash */
+ { 0x2018, 0x91 }, /* left single quotation mark */
+ { 0x2019, 0x92 }, /* right single quotation mark */
+ { 0x201A, 0x82 }, /* single low-9 quotation mark */
+ { 0x201C, 0x93 }, /* left double quotation mark */
+ { 0x201D, 0x94 }, /* right double quotation mark */
+ { 0x201E, 0x84 }, /* double low-9 quotation mark */
+ { 0x2020, 0x86 }, /* dagger */
+ { 0x2021, 0x87 }, /* double dagger */
+ { 0x2022, 0x95 }, /* bullet */
+ { 0x2026, 0x85 }, /* horizontal ellipsis */
+ { 0x2030, 0x89 }, /* per mille sign */
+ { 0x2039, 0x8B }, /* single left-pointing angle quotation mark */
+ { 0x203A, 0x9B }, /* single right-pointing angle quotation mark */
+ { 0x20AC, 0x88 }, /* euro sign */
+ { 0x2116, 0xB9 }, /* numero sign */
+ { 0x2122, 0x99 }, /* trade mark sign */
+};
+
+static const unicode_mapping unimap_koi8r[] = { /* Unicode code point -> KOI8-R byte; rows ascend by code point */
+ { 0xA0, 0x9A }, /* no-break space */
+ { 0xA9, 0xBF }, /* copyright sign */
+ { 0xB0, 0x9C }, /* degree sign */
+ { 0xB2, 0x9D }, /* superscript two */
+ { 0xB7, 0x9E }, /* middle dot */
+ { 0xF7, 0x9F }, /* division sign */
+ { 0x401, 0xB3 }, /* cyrillic capital letter io */
+ { 0x410, 0xE1 }, /* cyrillic capital letter a */
+ { 0x411, 0xE2 }, /* cyrillic capital letter be */
+ { 0x412, 0xF7 }, /* cyrillic capital letter ve */
+ { 0x413, 0xE7 }, /* cyrillic capital letter ghe */
+ { 0x414, 0xE4 }, /* cyrillic capital letter de */
+ { 0x415, 0xE5 }, /* cyrillic capital letter ie */
+ { 0x416, 0xF6 }, /* cyrillic capital letter zhe */
+ { 0x417, 0xFA }, /* cyrillic capital letter ze */
+ { 0x418, 0xE9 }, /* cyrillic capital letter i */
+ { 0x419, 0xEA }, /* cyrillic capital letter short i */
+ { 0x41A, 0xEB }, /* cyrillic capital letter ka */
+ { 0x41B, 0xEC }, /* cyrillic capital letter el */
+ { 0x41C, 0xED }, /* cyrillic capital letter em */
+ { 0x41D, 0xEE }, /* cyrillic capital letter en */
+ { 0x41E, 0xEF }, /* cyrillic capital letter o */
+ { 0x41F, 0xF0 }, /* cyrillic capital letter pe */
+ { 0x420, 0xF2 }, /* cyrillic capital letter er */
+ { 0x421, 0xF3 }, /* cyrillic capital letter es */
+ { 0x422, 0xF4 }, /* cyrillic capital letter te */
+ { 0x423, 0xF5 }, /* cyrillic capital letter u */
+ { 0x424, 0xE6 }, /* cyrillic capital letter ef */
+ { 0x425, 0xE8 }, /* cyrillic capital letter ha */
+ { 0x426, 0xE3 }, /* cyrillic capital letter tse */
+ { 0x427, 0xFE }, /* cyrillic capital letter che */
+ { 0x428, 0xFB }, /* cyrillic capital letter sha */
+ { 0x429, 0xFD }, /* cyrillic capital letter shcha */
+ { 0x42A, 0xFF }, /* cyrillic capital letter hard sign */
+ { 0x42B, 0xF9 }, /* cyrillic capital letter yeru */
+ { 0x42C, 0xF8 }, /* cyrillic capital letter soft sign */
+ { 0x42D, 0xFC }, /* cyrillic capital letter e */
+ { 0x42E, 0xE0 }, /* cyrillic capital letter yu */
+ { 0x42F, 0xF1 }, /* cyrillic capital letter ya */
+ { 0x430, 0xC1 }, /* cyrillic small letter a */
+ { 0x431, 0xC2 }, /* cyrillic small letter be */
+ { 0x432, 0xD7 }, /* cyrillic small letter ve */
+ { 0x433, 0xC7 }, /* cyrillic small letter ghe */
+ { 0x434, 0xC4 }, /* cyrillic small letter de */
+ { 0x435, 0xC5 }, /* cyrillic small letter ie */
+ { 0x436, 0xD6 }, /* cyrillic small letter zhe */
+ { 0x437, 0xDA }, /* cyrillic small letter ze */
+ { 0x438, 0xC9 }, /* cyrillic small letter i */
+ { 0x439, 0xCA }, /* cyrillic small letter short i */
+ { 0x43A, 0xCB }, /* cyrillic small letter ka */
+ { 0x43B, 0xCC }, /* cyrillic small letter el */
+ { 0x43C, 0xCD }, /* cyrillic small letter em */
+ { 0x43D, 0xCE }, /* cyrillic small letter en */
+ { 0x43E, 0xCF }, /* cyrillic small letter o */
+ { 0x43F, 0xD0 }, /* cyrillic small letter pe */
+ { 0x440, 0xD2 }, /* cyrillic small letter er */
+ { 0x441, 0xD3 }, /* cyrillic small letter es */
+ { 0x442, 0xD4 }, /* cyrillic small letter te */
+ { 0x443, 0xD5 }, /* cyrillic small letter u */
+ { 0x444, 0xC6 }, /* cyrillic small letter ef */
+ { 0x445, 0xC8 }, /* cyrillic small letter ha */
+ { 0x446, 0xC3 }, /* cyrillic small letter tse */
+ { 0x447, 0xDE }, /* cyrillic small letter che */
+ { 0x448, 0xDB }, /* cyrillic small letter sha */
+ { 0x449, 0xDD }, /* cyrillic small letter shcha */
+ { 0x44A, 0xDF }, /* cyrillic small letter hard sign */
+ { 0x44B, 0xD9 }, /* cyrillic small letter yeru */
+ { 0x44C, 0xD8 }, /* cyrillic small letter soft sign */
+ { 0x44D, 0xDC }, /* cyrillic small letter e */
+ { 0x44E, 0xC0 }, /* cyrillic small letter yu */
+ { 0x44F, 0xD1 }, /* cyrillic small letter ya */
+ { 0x451, 0xA3 }, /* cyrillic small letter io */
+ { 0x2219, 0x95 }, /* bullet operator */
+ { 0x221A, 0x96 }, /* square root */
+ { 0x2248, 0x97 }, /* almost equal to */
+ { 0x2264, 0x98 }, /* less-than or equal to */
+ { 0x2265, 0x99 }, /* greater-than or equal to */
+ { 0x2320, 0x93 }, /* top half integral */
+ { 0x2321, 0x9B }, /* bottom half integral */
+ { 0x2500, 0x80 }, /* box drawings light horizontal */
+ { 0x2502, 0x81 }, /* box drawings light vertical */
+ { 0x250C, 0x82 }, /* box drawings light down and right */
+ { 0x2510, 0x83 }, /* box drawings light down and left */
+ { 0x2514, 0x84 }, /* box drawings light up and right */
+ { 0x2518, 0x85 }, /* box drawings light up and left */
+ { 0x251C, 0x86 }, /* box drawings light vertical and right */
+ { 0x2524, 0x87 }, /* box drawings light vertical and left */
+ { 0x252C, 0x88 }, /* box drawings light down and horizontal */
+ { 0x2534, 0x89 }, /* box drawings light up and horizontal */
+ { 0x253C, 0x8A }, /* box drawings light vertical and horizontal */
+ { 0x2550, 0xA0 }, /* box drawings double horizontal */
+ { 0x2551, 0xA1 }, /* box drawings double vertical */
+ { 0x2552, 0xA2 }, /* box drawings down single and right double */
+ { 0x2553, 0xA4 }, /* box drawings down double and right single */
+ { 0x2554, 0xA5 }, /* box drawings double down and right */
+ { 0x2555, 0xA6 }, /* box drawings down single and left double */
+ { 0x2556, 0xA7 }, /* box drawings down double and left single */
+ { 0x2557, 0xA8 }, /* box drawings double down and left */
+ { 0x2558, 0xA9 }, /* box drawings up single and right double */
+ { 0x2559, 0xAA }, /* box drawings up double and right single */
+ { 0x255A, 0xAB }, /* box drawings double up and right */
+ { 0x255B, 0xAC }, /* box drawings up single and left double */
+ { 0x255C, 0xAD }, /* box drawings up double and left single */
+ { 0x255D, 0xAE }, /* box drawings double up and left */
+ { 0x255E, 0xAF }, /* box drawings vertical single and right double */
+ { 0x255F, 0xB0 }, /* box drawings vertical double and right single */
+ { 0x2560, 0xB1 }, /* box drawings double vertical and right */
+ { 0x2561, 0xB2 }, /* box drawings vertical single and left double */
+ { 0x2562, 0xB4 }, /* box drawings vertical double and left single */
+ { 0x2563, 0xB5 }, /* box drawings double vertical and left */
+ { 0x2564, 0xB6 }, /* box drawings down single and horizontal double */
+ { 0x2565, 0xB7 }, /* box drawings down double and horizontal single */
+ { 0x2566, 0xB8 }, /* box drawings double down and horizontal */
+ { 0x2567, 0xB9 }, /* box drawings up single and horizontal double */
+ { 0x2568, 0xBA }, /* box drawings up double and horizontal single */
+ { 0x2569, 0xBB }, /* box drawings double up and horizontal */
+ { 0x256A, 0xBC }, /* box drawings vertical single and horizontal double */
+ { 0x256B, 0xBD }, /* box drawings vertical double and horizontal single */
+ { 0x256C, 0xBE }, /* box drawings double vertical and horizontal */
+ { 0x2580, 0x8B }, /* upper half block */
+ { 0x2584, 0x8C }, /* lower half block */
+ { 0x2588, 0x8D }, /* full block */
+ { 0x258C, 0x8E }, /* left half block */
+ { 0x2590, 0x8F }, /* right half block */
+ { 0x2591, 0x90 }, /* light shade */
+ { 0x2592, 0x91 }, /* medium shade */
+ { 0x2593, 0x92 }, /* dark shade */
+ { 0x25A0, 0x94 }, /* black square */
+};
+
+static const unicode_mapping unimap_cp866[] = {
+ { 0xA0, 0xFF }, /* no-break space */
+ { 0xA4, 0xFD }, /* currency sign */
+ { 0xB0, 0xF8 }, /* degree sign */
+ { 0xB7, 0xFA }, /* middle dot */
+ { 0x401, 0xF0 }, /* cyrillic capital letter io */
+ { 0x404, 0xF2 }, /* cyrillic capital letter ukrainian ie */
+ { 0x407, 0xF4 }, /* cyrillic capital letter yi */
+ { 0x40E, 0xF6 }, /* cyrillic capital letter short u */
+ { 0x410, 0x80 }, /* cyrillic capital letter a */
+ { 0x411, 0x81 }, /* cyrillic capital letter be */
+ { 0x412, 0x82 }, /* cyrillic capital letter ve */
+ { 0x413, 0x83 }, /* cyrillic capital letter ghe */
+ { 0x414, 0x84 }, /* cyrillic capital letter de */
+ { 0x415, 0x85 }, /* cyrillic capital letter ie */
+ { 0x416, 0x86 }, /* cyrillic capital letter zhe */
+ { 0x417, 0x87 }, /* cyrillic capital letter ze */
+ { 0x418, 0x88 }, /* cyrillic capital letter i */
+ { 0x419, 0x89 }, /* cyrillic capital letter short i */
+ { 0x41A, 0x8A }, /* cyrillic capital letter ka */
+ { 0x41B, 0x8B }, /* cyrillic capital letter el */
+ { 0x41C, 0x8C }, /* cyrillic capital letter em */
+ { 0x41D, 0x8D }, /* cyrillic capital letter en */
+ { 0x41E, 0x8E }, /* cyrillic capital letter o */
+ { 0x41F, 0x8F }, /* cyrillic capital letter pe */
+ { 0x420, 0x90 }, /* cyrillic capital letter er */
+ { 0x421, 0x91 }, /* cyrillic capital letter es */
+ { 0x422, 0x92 }, /* cyrillic capital letter te */
+ { 0x423, 0x93 }, /* cyrillic capital letter u */
+ { 0x424, 0x94 }, /* cyrillic capital letter ef */
+ { 0x425, 0x95 }, /* cyrillic capital letter ha */
+ { 0x426, 0x96 }, /* cyrillic capital letter tse */
+ { 0x427, 0x97 }, /* cyrillic capital letter che */
+ { 0x428, 0x98 }, /* cyrillic capital letter sha */
+ { 0x429, 0x99 }, /* cyrillic capital letter shcha */
+ { 0x42A, 0x9A }, /* cyrillic capital letter hard sign */
+ { 0x42B, 0x9B }, /* cyrillic capital letter yeru */
+ { 0x42C, 0x9C }, /* cyrillic capital letter soft sign */
+ { 0x42D, 0x9D }, /* cyrillic capital letter e */
+ { 0x42E, 0x9F }, /* cyrillic capital letter ya */
+ { 0x430, 0xA0 }, /* cyrillic small letter a */
+ { 0x431, 0xA1 }, /* cyrillic small letter be */
+ { 0x432, 0xA2 }, /* cyrillic small letter ve */
+ { 0x433, 0xA3 }, /* cyrillic small letter ghe */
+ { 0x434, 0xA4 }, /* cyrillic small letter de */
+ { 0x435, 0xA5 }, /* cyrillic small letter ie */
+ { 0x436, 0xA6 }, /* cyrillic small letter zhe */
+ { 0x437, 0xA7 }, /* cyrillic small letter ze */
+ { 0x438, 0xA8 }, /* cyrillic small letter i */
+ { 0x439, 0xA9 }, /* cyrillic small letter short i */
+ { 0x43A, 0xAA }, /* cyrillic small letter ka */
+ { 0x43B, 0xAB }, /* cyrillic small letter el */
+ { 0x43C, 0xAC }, /* cyrillic small letter em */
+ { 0x43D, 0xAD }, /* cyrillic small letter en */
+ { 0x43E, 0xAE }, /* cyrillic small letter o */
+ { 0x43F, 0xAF }, /* cyrillic small letter pe */
+ { 0x440, 0xE0 }, /* cyrillic small letter er */
+ { 0x441, 0xE1 }, /* cyrillic small letter es */
+ { 0x442, 0xE2 }, /* cyrillic small letter te */
+ { 0x443, 0xE3 }, /* cyrillic small letter u */
+ { 0x444, 0xE4 }, /* cyrillic small letter ef */
+ { 0x445, 0xE5 }, /* cyrillic small letter ha */
+ { 0x446, 0xE6 }, /* cyrillic small letter tse */
+ { 0x447, 0xE7 }, /* cyrillic small letter che */
+ { 0x448, 0xE8 }, /* cyrillic small letter sha */
+ { 0x449, 0xE9 }, /* cyrillic small letter shcha */
+ { 0x44A, 0xEA }, /* cyrillic small letter hard sign */
+ { 0x44B, 0xEB }, /* cyrillic small letter yeru */
+ { 0x44C, 0xEC }, /* cyrillic small letter soft sign */
+ { 0x44D, 0xED }, /* cyrillic small letter e */
+ { 0x44E, 0xEE }, /* cyrillic small letter yu */
+ { 0x44F, 0xEF }, /* cyrillic small letter ya */
+ { 0x451, 0xF1 }, /* cyrillic small letter io */
+ { 0x454, 0xF3 }, /* cyrillic small letter ukrainian ie */
+ { 0x457, 0xF5 }, /* cyrillic small letter yi */
+ { 0x45E, 0xF7 }, /* cyrillic small letter short u */
+ { 0x2116, 0xFC }, /* numero sign */
+ { 0x2219, 0xF9 }, /* bullet operator */
+ { 0x221A, 0xFB }, /* square root */
+ { 0x2500, 0xC4 }, /* box drawings light horizontal */
+ { 0x2502, 0xB3 }, /* box drawings light vertical */
+ { 0x250C, 0xDA }, /* box drawings light down and right */
+ { 0x2510, 0xBF }, /* box drawings light down and left */
+ { 0x2514, 0xC0 }, /* box drawings light up and right */
+ { 0x2518, 0xD9 }, /* box drawings light up and left */
+ { 0x251C, 0xC3 }, /* box drawings light vertical and right */
+ { 0x2524, 0xB4 }, /* box drawings light vertical and left */
+ { 0x252C, 0xC2 }, /* box drawings light down and horizontal */
+ { 0x2534, 0xC1 }, /* box drawings light up and horizontal */
+ { 0x253C, 0xC5 }, /* box drawings light vertical and horizontal */
+ { 0x2550, 0xCD }, /* box drawings double horizontal */
+ { 0x2551, 0xBA }, /* box drawings double vertical */
+ { 0x2552, 0xD5 }, /* box drawings down single and right double */
+ { 0x2553, 0xD6 }, /* box drawings down double and right single */
+ { 0x2554, 0xC9 }, /* box drawings double down and right */
+ { 0x2555, 0xB8 }, /* box drawings down single and left double */
+ { 0x2556, 0xB7 }, /* box drawings down double and left single */
+ { 0x2557, 0xBB }, /* box drawings double down and left */
+ { 0x2558, 0xD4 }, /* box drawings up single and right double */
+ { 0x2559, 0xD3 }, /* box drawings up double and right single */
+ { 0x255A, 0xC8 }, /* box drawings double up and right */
+ { 0x255B, 0xBE }, /* box drawings up single and left double */
+ { 0x255C, 0xBD }, /* box drawings up double and left single */
+ { 0x255D, 0xBC }, /* box drawings double up and left */
+ { 0x255E, 0xC6 }, /* box drawings vertical single and right double */
+ { 0x255F, 0xC7 }, /* box drawings vertical double and right single */
+ { 0x2560, 0xCC }, /* box drawings double vertical and right */
+ { 0x2561, 0xB5 }, /* box drawings vertical single and left double */
+ { 0x2562, 0xB6 }, /* box drawings vertical double and left single */
+ { 0x2563, 0xB9 }, /* box drawings double vertical and left */
+ { 0x2564, 0xD1 }, /* box drawings down single and horizontal double */
+ { 0x2565, 0xD2 }, /* box drawings down double and horizontal single */
+ { 0x2566, 0xCB }, /* box drawings double down and horizontal */
+ { 0x2567, 0xCF }, /* box drawings up single and horizontal double */
+ { 0x2568, 0xD0 }, /* box drawings up double and horizontal single */
+ { 0x2569, 0xCA }, /* box drawings double up and horizontal */
+ { 0x256A, 0xD8 }, /* box drawings vertical single and horizontal double */
+ { 0x256B, 0xD7 }, /* box drawings vertical double and horizontal single */
+ { 0x256C, 0xCE }, /* box drawings double vertical and horizontal */
+ { 0x2580, 0xDF }, /* upper half block */
+ { 0x2584, 0xDC }, /* lower half block */
+ { 0x2588, 0xDB }, /* full block */
+ { 0x258C, 0xDD }, /* left half block */
+ { 0x2590, 0xDE }, /* right half block */
+ { 0x2591, 0xB0 }, /* light shade */
+ { 0x2592, 0xB1 }, /* medium shade */
+ { 0x2593, 0xB2 }, /* dark shade */
+ { 0x25A0, 0xFE }, /* black square */
+};
+
+static const unicode_mapping unimap_macroman[] = { /* each entry: { Unicode code point, byte value in MacRoman }; entries are listed in ascending code-point order (NOTE(review): presumably relied on by the lookup code in html.c -- confirm before reordering) */
+ { 0xA0, 0xCA }, /* no-break space */
+ { 0xA1, 0xC1 }, /* inverted exclamation mark */
+ { 0xA2, 0xA2 }, /* cent sign */
+ { 0xA3, 0xA3 }, /* pound sign */
+ { 0xA5, 0xB4 }, /* yen sign */
+ { 0xA7, 0xA4 }, /* section sign */
+ { 0xA8, 0xAC }, /* diaeresis */
+ { 0xA9, 0xA9 }, /* copyright sign */
+ { 0xAA, 0xBB }, /* feminine ordinal indicator */
+ { 0xAB, 0xC7 }, /* left-pointing double angle quotation mark */
+ { 0xAC, 0xC2 }, /* not sign */
+ { 0xAE, 0xA8 }, /* registered sign */
+ { 0xAF, 0xF8 }, /* macron */
+ { 0xB0, 0xA1 }, /* degree sign */
+ { 0xB1, 0xB1 }, /* plus-minus sign */
+ { 0xB4, 0xAB }, /* acute accent */
+ { 0xB5, 0xB5 }, /* micro sign */
+ { 0xB6, 0xA6 }, /* pilcrow sign */
+ { 0xB7, 0xE1 }, /* middle dot */
+ { 0xB8, 0xFC }, /* cedilla */
+ { 0xBA, 0xBC }, /* masculine ordinal indicator */
+ { 0xBB, 0xC8 }, /* right-pointing double angle quotation mark */
+ { 0xBF, 0xC0 }, /* inverted question mark */
+ { 0xC0, 0xCB }, /* latin capital letter a with grave */
+ { 0xC1, 0xE7 }, /* latin capital letter a with acute */
+ { 0xC2, 0xE5 }, /* latin capital letter a with circumflex */
+ { 0xC3, 0xCC }, /* latin capital letter a with tilde */
+ { 0xC4, 0x80 }, /* latin capital letter a with diaeresis */
+ { 0xC5, 0x81 }, /* latin capital letter a with ring above */
+ { 0xC6, 0xAE }, /* latin capital letter ae */
+ { 0xC7, 0x82 }, /* latin capital letter c with cedilla */
+ { 0xC8, 0xE9 }, /* latin capital letter e with grave */
+ { 0xC9, 0x83 }, /* latin capital letter e with acute */
+ { 0xCA, 0xE6 }, /* latin capital letter e with circumflex */
+ { 0xCB, 0xE8 }, /* latin capital letter e with diaeresis */
+ { 0xCC, 0xED }, /* latin capital letter i with grave */
+ { 0xCD, 0xEA }, /* latin capital letter i with acute */
+ { 0xCE, 0xEB }, /* latin capital letter i with circumflex */
+ { 0xCF, 0xEC }, /* latin capital letter i with diaeresis */
+ { 0xD1, 0x84 }, /* latin capital letter n with tilde */
+ { 0xD2, 0xF1 }, /* latin capital letter o with grave */
+ { 0xD3, 0xEE }, /* latin capital letter o with acute */
+ { 0xD4, 0xEF }, /* latin capital letter o with circumflex */
+ { 0xD5, 0xCD }, /* latin capital letter o with tilde */
+ { 0xD6, 0x85 }, /* latin capital letter o with diaeresis */
+ { 0xD8, 0xAF }, /* latin capital letter o with stroke */
+ { 0xD9, 0xF4 }, /* latin capital letter u with grave */
+ { 0xDA, 0xF2 }, /* latin capital letter u with acute */
+ { 0xDB, 0xF3 }, /* latin capital letter u with circumflex */
+ { 0xDC, 0x86 }, /* latin capital letter u with diaeresis */
+ { 0xDF, 0xA7 }, /* latin small letter sharp s */
+ { 0xE0, 0x88 }, /* latin small letter a with grave */
+ { 0xE1, 0x87 }, /* latin small letter a with acute */
+ { 0xE2, 0x89 }, /* latin small letter a with circumflex */
+ { 0xE3, 0x8B }, /* latin small letter a with tilde */
+ { 0xE4, 0x8A }, /* latin small letter a with diaeresis */
+ { 0xE5, 0x8C }, /* latin small letter a with ring above */
+ { 0xE6, 0xBE }, /* latin small letter ae */
+ { 0xE7, 0x8D }, /* latin small letter c with cedilla */
+ { 0xE8, 0x8F }, /* latin small letter e with grave */
+ { 0xE9, 0x8E }, /* latin small letter e with acute */
+ { 0xEA, 0x90 }, /* latin small letter e with circumflex */
+ { 0xEB, 0x91 }, /* latin small letter e with diaeresis */
+ { 0xEC, 0x93 }, /* latin small letter i with grave */
+ { 0xED, 0x92 }, /* latin small letter i with acute */
+ { 0xEE, 0x94 }, /* latin small letter i with circumflex */
+ { 0xEF, 0x95 }, /* latin small letter i with diaeresis */
+ { 0xF1, 0x96 }, /* latin small letter n with tilde */
+ { 0xF2, 0x98 }, /* latin small letter o with grave */
+ { 0xF3, 0x97 }, /* latin small letter o with acute */
+ { 0xF4, 0x99 }, /* latin small letter o with circumflex */
+ { 0xF5, 0x9B }, /* latin small letter o with tilde */
+ { 0xF6, 0x9A }, /* latin small letter o with diaeresis */
+ { 0xF7, 0xD6 }, /* division sign */
+ { 0xF8, 0xBF }, /* latin small letter o with stroke */
+ { 0xF9, 0x9D }, /* latin small letter u with grave */
+ { 0xFA, 0x9C }, /* latin small letter u with acute */
+ { 0xFB, 0x9E }, /* latin small letter u with circumflex */
+ { 0xFC, 0x9F }, /* latin small letter u with diaeresis */
+ { 0xFF, 0xD8 }, /* latin small letter y with diaeresis */
+ { 0x131, 0xF5 }, /* latin small letter dotless i */
+ { 0x152, 0xCE }, /* latin capital ligature oe */
+ { 0x153, 0xCF }, /* latin small ligature oe */
+ { 0x178, 0xD9 }, /* latin capital letter y with diaeresis */
+ { 0x192, 0xC4 }, /* latin small letter f with hook */
+ { 0x2C6, 0xF6 }, /* modifier letter circumflex accent */
+ { 0x2C7, 0xFF }, /* caron */
+ { 0x2D8, 0xF9 }, /* breve */
+ { 0x2D9, 0xFA }, /* dot above */
+ { 0x2DA, 0xFB }, /* ring above */
+ { 0x2DB, 0xFE }, /* ogonek */
+ { 0x2DC, 0xF7 }, /* small tilde */
+ { 0x2DD, 0xFD }, /* double acute accent */
+ { 0x3A9, 0xBD }, /* greek capital letter omega */
+ { 0x3C0, 0xB9 }, /* greek small letter pi */
+ { 0x2013, 0xD0 }, /* en dash */
+ { 0x2014, 0xD1 }, /* em dash */
+ { 0x2018, 0xD4 }, /* left single quotation mark */
+ { 0x2019, 0xD5 }, /* right single quotation mark */
+ { 0x201A, 0xE2 }, /* single low-9 quotation mark */
+ { 0x201C, 0xD2 }, /* left double quotation mark */
+ { 0x201D, 0xD3 }, /* right double quotation mark */
+ { 0x201E, 0xE3 }, /* double low-9 quotation mark */
+ { 0x2020, 0xA0 }, /* dagger */
+ { 0x2021, 0xE0 }, /* double dagger */
+ { 0x2022, 0xA5 }, /* bullet */
+ { 0x2026, 0xC9 }, /* horizontal ellipsis */
+ { 0x2030, 0xE4 }, /* per mille sign */
+ { 0x2039, 0xDC }, /* single left-pointing angle quotation mark */
+ { 0x203A, 0xDD }, /* single right-pointing angle quotation mark */
+ { 0x2044, 0xDA }, /* fraction slash */
+ { 0x20AC, 0xDB }, /* euro sign */
+ { 0x2122, 0xAA }, /* trade mark sign */
+ { 0x2202, 0xB6 }, /* partial differential */
+ { 0x2206, 0xC6 }, /* increment */
+ { 0x220F, 0xB8 }, /* n-ary product */
+ { 0x2211, 0xB7 }, /* n-ary summation */
+ { 0x221A, 0xC3 }, /* square root */
+ { 0x221E, 0xB0 }, /* infinity */
+ { 0x222B, 0xBA }, /* integral */
+ { 0x2248, 0xC5 }, /* almost equal to */
+ { 0x2260, 0xAD }, /* not equal to */
+ { 0x2264, 0xB2 }, /* less-than or equal to */
+ { 0x2265, 0xB3 }, /* greater-than or equal to */
+ { 0x25CA, 0xD7 }, /* lozenge */
+ { 0xF8FF, 0xF0 }, /* apple logo */
+ { 0xFB01, 0xDE }, /* latin small ligature fi */
+ { 0xFB02, 0xDF }, /* latin small ligature fl */
+};
+
+#endif /* HTML_TABLES_H */
diff --git a/ext/standard/tests/strings/get_html_translation_table_basic1.phpt b/ext/standard/tests/strings/get_html_translation_table_basic1.phpt
index c09388335b..8b6c9afdaa 100644
--- a/ext/standard/tests/strings/get_html_translation_table_basic1.phpt
+++ b/ext/standard/tests/strings/get_html_translation_table_basic1.phpt
@@ -43,14 +43,14 @@ echo "Done\n";
*** Testing get_html_translation_table() : basic functionality ***
-- with default arguments --
array(4) {
+ ["&"]=>
+ string(5) "&amp;"
["""]=>
string(6) "&quot;"
["<"]=>
string(4) "&lt;"
[">"]=>
string(4) "&gt;"
- ["&"]=>
- string(5) "&amp;"
}
-- with table = HTML_ENTITIES --
array(171) {
@@ -400,13 +400,13 @@ array(171) {
}
-- with table = HTML_SPECIALCHARS --
array(4) {
+ ["&"]=>
+ string(5) "&amp;"
["""]=>
string(6) "&quot;"
["<"]=>
string(4) "&lt;"
[">"]=>
string(4) "&gt;"
- ["&"]=>
- string(5) "&amp;"
}
Done
diff --git a/ext/standard/tests/strings/html_entity_decode_cp866.phpt b/ext/standard/tests/strings/html_entity_decode_cp866.phpt
new file mode 100644
index 0000000000..94b23b6660
--- /dev/null
+++ b/ext/standard/tests/strings/html_entity_decode_cp866.phpt
@@ -0,0 +1,533 @@
+--TEST--
+Translation of HTML entities for encoding CP866
+--FILE--
+<?php
+$arr = array(
+0x0410 => array(0x80, "CYRILLIC CAPITAL LETTER A"),
+0x0411 => array(0x81, "CYRILLIC CAPITAL LETTER BE"),
+0x0412 => array(0x82, "CYRILLIC CAPITAL LETTER VE"),
+0x0413 => array(0x83, "CYRILLIC CAPITAL LETTER GHE"),
+0x0414 => array(0x84, "CYRILLIC CAPITAL LETTER DE"),
+0x0415 => array(0x85, "CYRILLIC CAPITAL LETTER IE"),
+0x0416 => array(0x86, "CYRILLIC CAPITAL LETTER ZHE"),
+0x0417 => array(0x87, "CYRILLIC CAPITAL LETTER ZE"),
+0x0418 => array(0x88, "CYRILLIC CAPITAL LETTER I"),
+0x0419 => array(0x89, "CYRILLIC CAPITAL LETTER SHORT I"),
+0x041a => array(0x8a, "CYRILLIC CAPITAL LETTER KA"),
+0x041b => array(0x8b, "CYRILLIC CAPITAL LETTER EL"),
+0x041c => array(0x8c, "CYRILLIC CAPITAL LETTER EM"),
+0x041d => array(0x8d, "CYRILLIC CAPITAL LETTER EN"),
+0x041e => array(0x8e, "CYRILLIC CAPITAL LETTER O"),
+0x041f => array(0x8f, "CYRILLIC CAPITAL LETTER PE"),
+0x0420 => array(0x90, "CYRILLIC CAPITAL LETTER ER"),
+0x0421 => array(0x91, "CYRILLIC CAPITAL LETTER ES"),
+0x0422 => array(0x92, "CYRILLIC CAPITAL LETTER TE"),
+0x0423 => array(0x93, "CYRILLIC CAPITAL LETTER U"),
+0x0424 => array(0x94, "CYRILLIC CAPITAL LETTER EF"),
+0x0425 => array(0x95, "CYRILLIC CAPITAL LETTER HA"),
+0x0426 => array(0x96, "CYRILLIC CAPITAL LETTER TSE"),
+0x0427 => array(0x97, "CYRILLIC CAPITAL LETTER CHE"),
+0x0428 => array(0x98, "CYRILLIC CAPITAL LETTER SHA"),
+0x0429 => array(0x99, "CYRILLIC CAPITAL LETTER SHCHA"),
+0x042a => array(0x9a, "CYRILLIC CAPITAL LETTER HARD SIGN"),
+0x042b => array(0x9b, "CYRILLIC CAPITAL LETTER YERU"),
+0x042c => array(0x9c, "CYRILLIC CAPITAL LETTER SOFT SIGN"),
+0x042d => array(0x9d, "CYRILLIC CAPITAL LETTER E"),
+0x042e => array(0x9e, "CYRILLIC CAPITAL LETTER YU"),
+0x042f => array(0x9f, "CYRILLIC CAPITAL LETTER YA"),
+0x0430 => array(0xa0, "CYRILLIC SMALL LETTER A"),
+0x0431 => array(0xa1, "CYRILLIC SMALL LETTER BE"),
+0x0432 => array(0xa2, "CYRILLIC SMALL LETTER VE"),
+0x0433 => array(0xa3, "CYRILLIC SMALL LETTER GHE"),
+0x0434 => array(0xa4, "CYRILLIC SMALL LETTER DE"),
+0x0435 => array(0xa5, "CYRILLIC SMALL LETTER IE"),
+0x0436 => array(0xa6, "CYRILLIC SMALL LETTER ZHE"),
+0x0437 => array(0xa7, "CYRILLIC SMALL LETTER ZE"),
+0x0438 => array(0xa8, "CYRILLIC SMALL LETTER I"),
+0x0439 => array(0xa9, "CYRILLIC SMALL LETTER SHORT I"),
+0x043a => array(0xaa, "CYRILLIC SMALL LETTER KA"),
+0x043b => array(0xab, "CYRILLIC SMALL LETTER EL"),
+0x043c => array(0xac, "CYRILLIC SMALL LETTER EM"),
+0x043d => array(0xad, "CYRILLIC SMALL LETTER EN"),
+0x043e => array(0xae, "CYRILLIC SMALL LETTER O"),
+0x043f => array(0xaf, "CYRILLIC SMALL LETTER PE"),
+0x2591 => array(0xb0, "LIGHT SHADE"),
+0x2592 => array(0xb1, "MEDIUM SHADE"),
+0x2593 => array(0xb2, "DARK SHADE"),
+0x2502 => array(0xb3, "BOX DRAWINGS LIGHT VERTICAL"),
+0x2524 => array(0xb4, "BOX DRAWINGS LIGHT VERTICAL AND LEFT"),
+0x2561 => array(0xb5, "BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE"),
+0x2562 => array(0xb6, "BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE"),
+0x2556 => array(0xb7, "BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE"),
+0x2555 => array(0xb8, "BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE"),
+0x2563 => array(0xb9, "BOX DRAWINGS DOUBLE VERTICAL AND LEFT"),
+0x2551 => array(0xba, "BOX DRAWINGS DOUBLE VERTICAL"),
+0x2557 => array(0xbb, "BOX DRAWINGS DOUBLE DOWN AND LEFT"),
+0x255d => array(0xbc, "BOX DRAWINGS DOUBLE UP AND LEFT"),
+0x255c => array(0xbd, "BOX DRAWINGS UP DOUBLE AND LEFT SINGLE"),
+0x255b => array(0xbe, "BOX DRAWINGS UP SINGLE AND LEFT DOUBLE"),
+0x2510 => array(0xbf, "BOX DRAWINGS LIGHT DOWN AND LEFT"),
+0x2514 => array(0xc0, "BOX DRAWINGS LIGHT UP AND RIGHT"),
+0x2534 => array(0xc1, "BOX DRAWINGS LIGHT UP AND HORIZONTAL"),
+0x252c => array(0xc2, "BOX DRAWINGS LIGHT DOWN AND HORIZONTAL"),
+0x251c => array(0xc3, "BOX DRAWINGS LIGHT VERTICAL AND RIGHT"),
+0x2500 => array(0xc4, "BOX DRAWINGS LIGHT HORIZONTAL"),
+0x253c => array(0xc5, "BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL"),
+0x255e => array(0xc6, "BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE"),
+0x255f => array(0xc7, "BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE"),
+0x255a => array(0xc8, "BOX DRAWINGS DOUBLE UP AND RIGHT"),
+0x2554 => array(0xc9, "BOX DRAWINGS DOUBLE DOWN AND RIGHT"),
+0x2569 => array(0xca, "BOX DRAWINGS DOUBLE UP AND HORIZONTAL"),
+0x2566 => array(0xcb, "BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL"),
+0x2560 => array(0xcc, "BOX DRAWINGS DOUBLE VERTICAL AND RIGHT"),
+0x2550 => array(0xcd, "BOX DRAWINGS DOUBLE HORIZONTAL"),
+0x256c => array(0xce, "BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL"),
+0x2567 => array(0xcf, "BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE"),
+0x2568 => array(0xd0, "BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE"),
+0x2564 => array(0xd1, "BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE"),
+0x2565 => array(0xd2, "BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE"),
+0x2559 => array(0xd3, "BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE"),
+0x2558 => array(0xd4, "BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE"),
+0x2552 => array(0xd5, "BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE"),
+0x2553 => array(0xd6, "BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE"),
+0x256b => array(0xd7, "BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE"),
+0x256a => array(0xd8, "BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE"),
+0x2518 => array(0xd9, "BOX DRAWINGS LIGHT UP AND LEFT"),
+0x250c => array(0xda, "BOX DRAWINGS LIGHT DOWN AND RIGHT"),
+0x2588 => array(0xdb, "FULL BLOCK"),
+0x2584 => array(0xdc, "LOWER HALF BLOCK"),
+0x258c => array(0xdd, "LEFT HALF BLOCK"),
+0x2590 => array(0xde, "RIGHT HALF BLOCK"),
+0x2580 => array(0xdf, "UPPER HALF BLOCK"),
+0x0440 => array(0xe0, "CYRILLIC SMALL LETTER ER"),
+0x0441 => array(0xe1, "CYRILLIC SMALL LETTER ES"),
+0x0442 => array(0xe2, "CYRILLIC SMALL LETTER TE"),
+0x0443 => array(0xe3, "CYRILLIC SMALL LETTER U"),
+0x0444 => array(0xe4, "CYRILLIC SMALL LETTER EF"),
+0x0445 => array(0xe5, "CYRILLIC SMALL LETTER HA"),
+0x0446 => array(0xe6, "CYRILLIC SMALL LETTER TSE"),
+0x0447 => array(0xe7, "CYRILLIC SMALL LETTER CHE"),
+0x0448 => array(0xe8, "CYRILLIC SMALL LETTER SHA"),
+0x0449 => array(0xe9, "CYRILLIC SMALL LETTER SHCHA"),
+0x044a => array(0xea, "CYRILLIC SMALL LETTER HARD SIGN"),
+0x044b => array(0xeb, "CYRILLIC SMALL LETTER YERU"),
+0x044c => array(0xec, "CYRILLIC SMALL LETTER SOFT SIGN"),
+0x044d => array(0xed, "CYRILLIC SMALL LETTER E"),
+0x044e => array(0xee, "CYRILLIC SMALL LETTER YU"),
+0x044f => array(0xef, "CYRILLIC SMALL LETTER YA"),
+0x0401 => array(0xf0, "CYRILLIC CAPITAL LETTER IO"),
+0x0451 => array(0xf1, "CYRILLIC SMALL LETTER IO"),
+0x0404 => array(0xf2, "CYRILLIC CAPITAL LETTER UKRAINIAN IE"),
+0x0454 => array(0xf3, "CYRILLIC SMALL LETTER UKRAINIAN IE"),
+0x0407 => array(0xf4, "CYRILLIC CAPITAL LETTER YI"),
+0x0457 => array(0xf5, "CYRILLIC SMALL LETTER YI"),
+0x040e => array(0xf6, "CYRILLIC CAPITAL LETTER SHORT U"),
+0x045e => array(0xf7, "CYRILLIC SMALL LETTER SHORT U"),
+0x00b0 => array(0xf8, "DEGREE SIGN"),
+0x2219 => array(0xf9, "BULLET OPERATOR"),
+0x00b7 => array(0xfa, "MIDDLE DOT"),
+0x221a => array(0xfb, "SQUARE ROOT"),
+0x2116 => array(0xfc, "NUMERO SIGN"),
+0x00a4 => array(0xfd, "CURRENCY SIGN"),
+0x25a0 => array(0xfe, "BLACK SQUARE"),
+0x00a0 => array(0xff, "NO-BREAK SPACE"),
+);
+
+foreach ($arr as $u => $v) { // $u: Unicode code point (array key); $v: array(byte value, character name)
+ $ent = sprintf("&#x%X;", $u); // hex numeric entity for the Unicode code point
+ $res = html_entity_decode($ent, ENT_QUOTES, 'CP866');
+ $d = unpack("H*", $res); // hex dump of whatever the decoder returned
+ echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]);
+
+ $ent = sprintf("&#x%X;", $v[0]); // second probe: the byte value itself used as an entity number
+ $res = html_entity_decode($ent, ENT_QUOTES, 'CP866');
+ if ($res[0] != "&" || $res[1] != "#") // result no longer starts with "&#", i.e. the entity was decoded
+ $res = unpack("H*", $res)[1]; // show decoded bytes as hex; an undecoded entity is printed verbatim
+ echo sprintf("%s => %s\n\n", $ent, $res);
+}
+--EXPECT--
+CYRILLIC CAPITAL LETTER A: &#x410; => 80
+&#x80; => &#x80;
+
+CYRILLIC CAPITAL LETTER BE: &#x411; => 81
+&#x81; => &#x81;
+
+CYRILLIC CAPITAL LETTER VE: &#x412; => 82
+&#x82; => &#x82;
+
+CYRILLIC CAPITAL LETTER GHE: &#x413; => 83
+&#x83; => &#x83;
+
+CYRILLIC CAPITAL LETTER DE: &#x414; => 84
+&#x84; => &#x84;
+
+CYRILLIC CAPITAL LETTER IE: &#x415; => 85
+&#x85; => &#x85;
+
+CYRILLIC CAPITAL LETTER ZHE: &#x416; => 86
+&#x86; => &#x86;
+
+CYRILLIC CAPITAL LETTER ZE: &#x417; => 87
+&#x87; => &#x87;
+
+CYRILLIC CAPITAL LETTER I: &#x418; => 88
+&#x88; => &#x88;
+
+CYRILLIC CAPITAL LETTER SHORT I: &#x419; => 89
+&#x89; => &#x89;
+
+CYRILLIC CAPITAL LETTER KA: &#x41A; => 8a
+&#x8A; => &#x8A;
+
+CYRILLIC CAPITAL LETTER EL: &#x41B; => 8b
+&#x8B; => &#x8B;
+
+CYRILLIC CAPITAL LETTER EM: &#x41C; => 8c
+&#x8C; => &#x8C;
+
+CYRILLIC CAPITAL LETTER EN: &#x41D; => 8d
+&#x8D; => &#x8D;
+
+CYRILLIC CAPITAL LETTER O: &#x41E; => 8e
+&#x8E; => &#x8E;
+
+CYRILLIC CAPITAL LETTER PE: &#x41F; => 8f
+&#x8F; => &#x8F;
+
+CYRILLIC CAPITAL LETTER ER: &#x420; => 90
+&#x90; => &#x90;
+
+CYRILLIC CAPITAL LETTER ES: &#x421; => 91
+&#x91; => &#x91;
+
+CYRILLIC CAPITAL LETTER TE: &#x422; => 92
+&#x92; => &#x92;
+
+CYRILLIC CAPITAL LETTER U: &#x423; => 93
+&#x93; => &#x93;
+
+CYRILLIC CAPITAL LETTER EF: &#x424; => 94
+&#x94; => &#x94;
+
+CYRILLIC CAPITAL LETTER HA: &#x425; => 95
+&#x95; => &#x95;
+
+CYRILLIC CAPITAL LETTER TSE: &#x426; => 96
+&#x96; => &#x96;
+
+CYRILLIC CAPITAL LETTER CHE: &#x427; => 97
+&#x97; => &#x97;
+
+CYRILLIC CAPITAL LETTER SHA: &#x428; => 98
+&#x98; => &#x98;
+
+CYRILLIC CAPITAL LETTER SHCHA: &#x429; => 99
+&#x99; => &#x99;
+
+CYRILLIC CAPITAL LETTER HARD SIGN: &#x42A; => 9a
+&#x9A; => &#x9A;
+
+CYRILLIC CAPITAL LETTER YERU: &#x42B; => 9b
+&#x9B; => &#x9B;
+
+CYRILLIC CAPITAL LETTER SOFT SIGN: &#x42C; => 9c
+&#x9C; => &#x9C;
+
+CYRILLIC CAPITAL LETTER E: &#x42D; => 9d
+&#x9D; => &#x9D;
+
+CYRILLIC CAPITAL LETTER YU: &#x42E; => 9e
+&#x9E; => &#x9E;
+
+CYRILLIC CAPITAL LETTER YA: &#x42F; => 9f
+&#x9F; => &#x9F;
+
+CYRILLIC SMALL LETTER A: &#x430; => a0
+&#xA0; => ff
+
+CYRILLIC SMALL LETTER BE: &#x431; => a1
+&#xA1; => &#xA1;
+
+CYRILLIC SMALL LETTER VE: &#x432; => a2
+&#xA2; => &#xA2;
+
+CYRILLIC SMALL LETTER GHE: &#x433; => a3
+&#xA3; => &#xA3;
+
+CYRILLIC SMALL LETTER DE: &#x434; => a4
+&#xA4; => fd
+
+CYRILLIC SMALL LETTER IE: &#x435; => a5
+&#xA5; => &#xA5;
+
+CYRILLIC SMALL LETTER ZHE: &#x436; => a6
+&#xA6; => &#xA6;
+
+CYRILLIC SMALL LETTER ZE: &#x437; => a7
+&#xA7; => &#xA7;
+
+CYRILLIC SMALL LETTER I: &#x438; => a8
+&#xA8; => &#xA8;
+
+CYRILLIC SMALL LETTER SHORT I: &#x439; => a9
+&#xA9; => &#xA9;
+
+CYRILLIC SMALL LETTER KA: &#x43A; => aa
+&#xAA; => &#xAA;
+
+CYRILLIC SMALL LETTER EL: &#x43B; => ab
+&#xAB; => &#xAB;
+
+CYRILLIC SMALL LETTER EM: &#x43C; => ac
+&#xAC; => &#xAC;
+
+CYRILLIC SMALL LETTER EN: &#x43D; => ad
+&#xAD; => &#xAD;
+
+CYRILLIC SMALL LETTER O: &#x43E; => ae
+&#xAE; => &#xAE;
+
+CYRILLIC SMALL LETTER PE: &#x43F; => af
+&#xAF; => &#xAF;
+
+LIGHT SHADE: &#x2591; => b0
+&#xB0; => f8
+
+MEDIUM SHADE: &#x2592; => b1
+&#xB1; => &#xB1;
+
+DARK SHADE: &#x2593; => b2
+&#xB2; => &#xB2;
+
+BOX DRAWINGS LIGHT VERTICAL: &#x2502; => b3
+&#xB3; => &#xB3;
+
+BOX DRAWINGS LIGHT VERTICAL AND LEFT: &#x2524; => b4
+&#xB4; => &#xB4;
+
+BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE: &#x2561; => b5
+&#xB5; => &#xB5;
+
+BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE: &#x2562; => b6
+&#xB6; => &#xB6;
+
+BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE: &#x2556; => b7
+&#xB7; => fa
+
+BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE: &#x2555; => b8
+&#xB8; => &#xB8;
+
+BOX DRAWINGS DOUBLE VERTICAL AND LEFT: &#x2563; => b9
+&#xB9; => &#xB9;
+
+BOX DRAWINGS DOUBLE VERTICAL: &#x2551; => ba
+&#xBA; => &#xBA;
+
+BOX DRAWINGS DOUBLE DOWN AND LEFT: &#x2557; => bb
+&#xBB; => &#xBB;
+
+BOX DRAWINGS DOUBLE UP AND LEFT: &#x255D; => bc
+&#xBC; => &#xBC;
+
+BOX DRAWINGS UP DOUBLE AND LEFT SINGLE: &#x255C; => bd
+&#xBD; => &#xBD;
+
+BOX DRAWINGS UP SINGLE AND LEFT DOUBLE: &#x255B; => be
+&#xBE; => &#xBE;
+
+BOX DRAWINGS LIGHT DOWN AND LEFT: &#x2510; => bf
+&#xBF; => &#xBF;
+
+BOX DRAWINGS LIGHT UP AND RIGHT: &#x2514; => c0
+&#xC0; => &#xC0;
+
+BOX DRAWINGS LIGHT UP AND HORIZONTAL: &#x2534; => c1
+&#xC1; => &#xC1;
+
+BOX DRAWINGS LIGHT DOWN AND HORIZONTAL: &#x252C; => c2
+&#xC2; => &#xC2;
+
+BOX DRAWINGS LIGHT VERTICAL AND RIGHT: &#x251C; => c3
+&#xC3; => &#xC3;
+
+BOX DRAWINGS LIGHT HORIZONTAL: &#x2500; => c4
+&#xC4; => &#xC4;
+
+BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL: &#x253C; => c5
+&#xC5; => &#xC5;
+
+BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE: &#x255E; => c6
+&#xC6; => &#xC6;
+
+BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE: &#x255F; => c7
+&#xC7; => &#xC7;
+
+BOX DRAWINGS DOUBLE UP AND RIGHT: &#x255A; => c8
+&#xC8; => &#xC8;
+
+BOX DRAWINGS DOUBLE DOWN AND RIGHT: &#x2554; => c9
+&#xC9; => &#xC9;
+
+BOX DRAWINGS DOUBLE UP AND HORIZONTAL: &#x2569; => ca
+&#xCA; => &#xCA;
+
+BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL: &#x2566; => cb
+&#xCB; => &#xCB;
+
+BOX DRAWINGS DOUBLE VERTICAL AND RIGHT: &#x2560; => cc
+&#xCC; => &#xCC;
+
+BOX DRAWINGS DOUBLE HORIZONTAL: &#x2550; => cd
+&#xCD; => &#xCD;
+
+BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL: &#x256C; => ce
+&#xCE; => &#xCE;
+
+BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE: &#x2567; => cf
+&#xCF; => &#xCF;
+
+BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE: &#x2568; => d0
+&#xD0; => &#xD0;
+
+BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE: &#x2564; => d1
+&#xD1; => &#xD1;
+
+BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE: &#x2565; => d2
+&#xD2; => &#xD2;
+
+BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE: &#x2559; => d3
+&#xD3; => &#xD3;
+
+BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE: &#x2558; => d4
+&#xD4; => &#xD4;
+
+BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE: &#x2552; => d5
+&#xD5; => &#xD5;
+
+BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE: &#x2553; => d6
+&#xD6; => &#xD6;
+
+BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE: &#x256B; => d7
+&#xD7; => &#xD7;
+
+BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE: &#x256A; => d8
+&#xD8; => &#xD8;
+
+BOX DRAWINGS LIGHT UP AND LEFT: &#x2518; => d9
+&#xD9; => &#xD9;
+
+BOX DRAWINGS LIGHT DOWN AND RIGHT: &#x250C; => da
+&#xDA; => &#xDA;
+
+FULL BLOCK: &#x2588; => db
+&#xDB; => &#xDB;
+
+LOWER HALF BLOCK: &#x2584; => dc
+&#xDC; => &#xDC;
+
+LEFT HALF BLOCK: &#x258C; => dd
+&#xDD; => &#xDD;
+
+RIGHT HALF BLOCK: &#x2590; => de
+&#xDE; => &#xDE;
+
+UPPER HALF BLOCK: &#x2580; => df
+&#xDF; => &#xDF;
+
+CYRILLIC SMALL LETTER ER: &#x440; => e0
+&#xE0; => &#xE0;
+
+CYRILLIC SMALL LETTER ES: &#x441; => e1
+&#xE1; => &#xE1;
+
+CYRILLIC SMALL LETTER TE: &#x442; => e2
+&#xE2; => &#xE2;
+
+CYRILLIC SMALL LETTER U: &#x443; => e3
+&#xE3; => &#xE3;
+
+CYRILLIC SMALL LETTER EF: &#x444; => e4
+&#xE4; => &#xE4;
+
+CYRILLIC SMALL LETTER HA: &#x445; => e5
+&#xE5; => &#xE5;
+
+CYRILLIC SMALL LETTER TSE: &#x446; => e6
+&#xE6; => &#xE6;
+
+CYRILLIC SMALL LETTER CHE: &#x447; => e7
+&#xE7; => &#xE7;
+
+CYRILLIC SMALL LETTER SHA: &#x448; => e8
+&#xE8; => &#xE8;
+
+CYRILLIC SMALL LETTER SHCHA: &#x449; => e9
+&#xE9; => &#xE9;
+
+CYRILLIC SMALL LETTER HARD SIGN: &#x44A; => ea
+&#xEA; => &#xEA;
+
+CYRILLIC SMALL LETTER YERU: &#x44B; => eb
+&#xEB; => &#xEB;
+
+CYRILLIC SMALL LETTER SOFT SIGN: &#x44C; => ec
+&#xEC; => &#xEC;
+
+CYRILLIC SMALL LETTER E: &#x44D; => ed
+&#xED; => &#xED;
+
+CYRILLIC SMALL LETTER YU: &#x44E; => ee
+&#xEE; => &#xEE;
+
+CYRILLIC SMALL LETTER YA: &#x44F; => ef
+&#xEF; => &#xEF;
+
+CYRILLIC CAPITAL LETTER IO: &#x401; => f0
+&#xF0; => &#xF0;
+
+CYRILLIC SMALL LETTER IO: &#x451; => f1
+&#xF1; => &#xF1;
+
+CYRILLIC CAPITAL LETTER UKRAINIAN IE: &#x404; => f2
+&#xF2; => &#xF2;
+
+CYRILLIC SMALL LETTER UKRAINIAN IE: &#x454; => f3
+&#xF3; => &#xF3;
+
+CYRILLIC CAPITAL LETTER YI: &#x407; => f4
+&#xF4; => &#xF4;
+
+CYRILLIC SMALL LETTER YI: &#x457; => f5
+&#xF5; => &#xF5;
+
+CYRILLIC CAPITAL LETTER SHORT U: &#x40E; => f6
+&#xF6; => &#xF6;
+
+CYRILLIC SMALL LETTER SHORT U: &#x45E; => f7
+&#xF7; => &#xF7;
+
+DEGREE SIGN: &#xB0; => f8
+&#xF8; => &#xF8;
+
+BULLET OPERATOR: &#x2219; => f9
+&#xF9; => &#xF9;
+
+MIDDLE DOT: &#xB7; => fa
+&#xFA; => &#xFA;
+
+SQUARE ROOT: &#x221A; => fb
+&#xFB; => &#xFB;
+
+NUMERO SIGN: &#x2116; => fc
+&#xFC; => &#xFC;
+
+CURRENCY SIGN: &#xA4; => fd
+&#xFD; => &#xFD;
+
+BLACK SQUARE: &#x25A0; => fe
+&#xFE; => &#xFE;
+
+NO-BREAK SPACE: &#xA0; => ff
+&#xFF; => &#xFF;
+
+
diff --git a/ext/standard/tests/strings/html_entity_decode_iso8859-15.phpt b/ext/standard/tests/strings/html_entity_decode_iso8859-15.phpt
new file mode 100644
index 0000000000..a3be8f3668
--- /dev/null
+++ b/ext/standard/tests/strings/html_entity_decode_iso8859-15.phpt
@@ -0,0 +1,405 @@
+--TEST--
+Translation of HTML entities for encoding ISO-8859-15
+--FILE--
+<?php
+$arr = array( // Unicode code point => array(byte value in ISO-8859-15, Unicode character name)
+0x00A0 => array(0xA0, "NO-BREAK SPACE"),
+0x00A1 => array(0xA1, "INVERTED EXCLAMATION MARK"),
+0x00A2 => array(0xA2, "CENT SIGN"),
+0x00A3 => array(0xA3, "POUND SIGN"),
+0x20AC => array(0xA4, "EURO SIGN"),
+0x00A5 => array(0xA5, "YEN SIGN"),
+0x0160 => array(0xA6, "LATIN CAPITAL LETTER S WITH CARON"),
+0x00A7 => array(0xA7, "SECTION SIGN"),
+0x0161 => array(0xA8, "LATIN SMALL LETTER S WITH CARON"),
+0x00A9 => array(0xA9, "COPYRIGHT SIGN"),
+0x00AA => array(0xAA, "FEMININE ORDINAL INDICATOR"),
+0x00AB => array(0xAB, "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK"),
+0x00AC => array(0xAC, "NOT SIGN"),
+0x00AD => array(0xAD, "SOFT HYPHEN"),
+0x00AE => array(0xAE, "REGISTERED SIGN"),
+0x00AF => array(0xAF, "MACRON"),
+0x00B0 => array(0xB0, "DEGREE SIGN"),
+0x00B1 => array(0xB1, "PLUS-MINUS SIGN"),
+0x00B2 => array(0xB2, "SUPERSCRIPT TWO"),
+0x00B3 => array(0xB3, "SUPERSCRIPT THREE"),
+0x017D => array(0xB4, "LATIN CAPITAL LETTER Z WITH CARON"),
+0x00B5 => array(0xB5, "MICRO SIGN"),
+0x00B6 => array(0xB6, "PILCROW SIGN"),
+0x00B7 => array(0xB7, "MIDDLE DOT"),
+0x017E => array(0xB8, "LATIN SMALL LETTER Z WITH CARON"),
+0x00B9 => array(0xB9, "SUPERSCRIPT ONE"),
+0x00BA => array(0xBA, "MASCULINE ORDINAL INDICATOR"),
+0x00BB => array(0xBB, "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK"),
+0x0152 => array(0xBC, "LATIN CAPITAL LIGATURE OE"),
+0x0153 => array(0xBD, "LATIN SMALL LIGATURE OE"),
+0x0178 => array(0xBE, "LATIN CAPITAL LETTER Y WITH DIAERESIS"),
+0x00BF => array(0xBF, "INVERTED QUESTION MARK"),
+0x00C0 => array(0xC0, "LATIN CAPITAL LETTER A WITH GRAVE"),
+0x00C1 => array(0xC1, "LATIN CAPITAL LETTER A WITH ACUTE"),
+0x00C2 => array(0xC2, "LATIN CAPITAL LETTER A WITH CIRCUMFLEX"),
+0x00C3 => array(0xC3, "LATIN CAPITAL LETTER A WITH TILDE"),
+0x00C4 => array(0xC4, "LATIN CAPITAL LETTER A WITH DIAERESIS"),
+0x00C5 => array(0xC5, "LATIN CAPITAL LETTER A WITH RING ABOVE"),
+0x00C6 => array(0xC6, "LATIN CAPITAL LETTER AE"),
+0x00C7 => array(0xC7, "LATIN CAPITAL LETTER C WITH CEDILLA"),
+0x00C8 => array(0xC8, "LATIN CAPITAL LETTER E WITH GRAVE"),
+0x00C9 => array(0xC9, "LATIN CAPITAL LETTER E WITH ACUTE"),
+0x00CA => array(0xCA, "LATIN CAPITAL LETTER E WITH CIRCUMFLEX"),
+0x00CB => array(0xCB, "LATIN CAPITAL LETTER E WITH DIAERESIS"),
+0x00CC => array(0xCC, "LATIN CAPITAL LETTER I WITH GRAVE"),
+0x00CD => array(0xCD, "LATIN CAPITAL LETTER I WITH ACUTE"),
+0x00CE => array(0xCE, "LATIN CAPITAL LETTER I WITH CIRCUMFLEX"),
+0x00CF => array(0xCF, "LATIN CAPITAL LETTER I WITH DIAERESIS"),
+0x00D0 => array(0xD0, "LATIN CAPITAL LETTER ETH"),
+0x00D1 => array(0xD1, "LATIN CAPITAL LETTER N WITH TILDE"),
+0x00D2 => array(0xD2, "LATIN CAPITAL LETTER O WITH GRAVE"),
+0x00D3 => array(0xD3, "LATIN CAPITAL LETTER O WITH ACUTE"),
+0x00D4 => array(0xD4, "LATIN CAPITAL LETTER O WITH CIRCUMFLEX"),
+0x00D5 => array(0xD5, "LATIN CAPITAL LETTER O WITH TILDE"),
+0x00D6 => array(0xD6, "LATIN CAPITAL LETTER O WITH DIAERESIS"),
+0x00D7 => array(0xD7, "MULTIPLICATION SIGN"),
+0x00D8 => array(0xD8, "LATIN CAPITAL LETTER O WITH STROKE"),
+0x00D9 => array(0xD9, "LATIN CAPITAL LETTER U WITH GRAVE"),
+0x00DA => array(0xDA, "LATIN CAPITAL LETTER U WITH ACUTE"),
+0x00DB => array(0xDB, "LATIN CAPITAL LETTER U WITH CIRCUMFLEX"),
+0x00DC => array(0xDC, "LATIN CAPITAL LETTER U WITH DIAERESIS"),
+0x00DD => array(0xDD, "LATIN CAPITAL LETTER Y WITH ACUTE"),
+0x00DE => array(0xDE, "LATIN CAPITAL LETTER THORN"),
+0x00DF => array(0xDF, "LATIN SMALL LETTER SHARP S"),
+0x00E0 => array(0xE0, "LATIN SMALL LETTER A WITH GRAVE"),
+0x00E1 => array(0xE1, "LATIN SMALL LETTER A WITH ACUTE"),
+0x00E2 => array(0xE2, "LATIN SMALL LETTER A WITH CIRCUMFLEX"),
+0x00E3 => array(0xE3, "LATIN SMALL LETTER A WITH TILDE"),
+0x00E4 => array(0xE4, "LATIN SMALL LETTER A WITH DIAERESIS"),
+0x00E5 => array(0xE5, "LATIN SMALL LETTER A WITH RING ABOVE"),
+0x00E6 => array(0xE6, "LATIN SMALL LETTER AE"),
+0x00E7 => array(0xE7, "LATIN SMALL LETTER C WITH CEDILLA"),
+0x00E8 => array(0xE8, "LATIN SMALL LETTER E WITH GRAVE"),
+0x00E9 => array(0xE9, "LATIN SMALL LETTER E WITH ACUTE"),
+0x00EA => array(0xEA, "LATIN SMALL LETTER E WITH CIRCUMFLEX"),
+0x00EB => array(0xEB, "LATIN SMALL LETTER E WITH DIAERESIS"),
+0x00EC => array(0xEC, "LATIN SMALL LETTER I WITH GRAVE"),
+0x00ED => array(0xED, "LATIN SMALL LETTER I WITH ACUTE"),
+0x00EE => array(0xEE, "LATIN SMALL LETTER I WITH CIRCUMFLEX"),
+0x00EF => array(0xEF, "LATIN SMALL LETTER I WITH DIAERESIS"),
+0x00F0 => array(0xF0, "LATIN SMALL LETTER ETH"),
+0x00F1 => array(0xF1, "LATIN SMALL LETTER N WITH TILDE"),
+0x00F2 => array(0xF2, "LATIN SMALL LETTER O WITH GRAVE"),
+0x00F3 => array(0xF3, "LATIN SMALL LETTER O WITH ACUTE"),
+0x00F4 => array(0xF4, "LATIN SMALL LETTER O WITH CIRCUMFLEX"),
+0x00F5 => array(0xF5, "LATIN SMALL LETTER O WITH TILDE"),
+0x00F6 => array(0xF6, "LATIN SMALL LETTER O WITH DIAERESIS"),
+0x00F7 => array(0xF7, "DIVISION SIGN"),
+0x00F8 => array(0xF8, "LATIN SMALL LETTER O WITH STROKE"),
+0x00F9 => array(0xF9, "LATIN SMALL LETTER U WITH GRAVE"),
+0x00FA => array(0xFA, "LATIN SMALL LETTER U WITH ACUTE"),
+0x00FB => array(0xFB, "LATIN SMALL LETTER U WITH CIRCUMFLEX"),
+0x00FC => array(0xFC, "LATIN SMALL LETTER U WITH DIAERESIS"),
+0x00FD => array(0xFD, "LATIN SMALL LETTER Y WITH ACUTE"),
+0x00FE => array(0xFE, "LATIN SMALL LETTER THORN"),
+0x00FF => array(0xFF, "LATIN SMALL LETTER Y WITH DIAERESIS"),
+);
+
+foreach ($arr as $u => $v) { // $u: table key (code point); $v[0]: ISO-8859-15 byte value; $v[1]: character name
+ $ent = sprintf("&#x%X;", $u); // hexadecimal numeric entity built from the table key
+ $res = html_entity_decode($ent, ENT_QUOTES, 'ISO-8859-15');
+ $d = unpack("H*", $res); // hex dump of whatever the decoder returned
+ echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]);
+
+ $ent = sprintf("&#x%X;", $v[0]); // second probe: the byte value itself written as a numeric entity
+ $res = html_entity_decode($ent, ENT_QUOTES, 'ISO-8859-15');
+ if ($res[0] != "&" || $res[1] != "#") // a result still starting with "&#" is printed verbatim below
+ $res = unpack("H*", $res)[1]; // otherwise show the decoded byte(s) as hex
+ echo sprintf("%s => %s\n\n", $ent, $res);
+}
+--EXPECT--
+NO-BREAK SPACE: &#xA0; => a0
+&#xA0; => a0
+
+INVERTED EXCLAMATION MARK: &#xA1; => a1
+&#xA1; => a1
+
+CENT SIGN: &#xA2; => a2
+&#xA2; => a2
+
+POUND SIGN: &#xA3; => a3
+&#xA3; => a3
+
+EURO SIGN: &#x20AC; => a4
+&#xA4; => &#xA4;
+
+YEN SIGN: &#xA5; => a5
+&#xA5; => a5
+
+LATIN CAPITAL LETTER S WITH CARON: &#x160; => a6
+&#xA6; => &#xA6;
+
+SECTION SIGN: &#xA7; => a7
+&#xA7; => a7
+
+LATIN SMALL LETTER S WITH CARON: &#x161; => a8
+&#xA8; => &#xA8;
+
+COPYRIGHT SIGN: &#xA9; => a9
+&#xA9; => a9
+
+FEMININE ORDINAL INDICATOR: &#xAA; => aa
+&#xAA; => aa
+
+LEFT-POINTING DOUBLE ANGLE QUOTATION MARK: &#xAB; => ab
+&#xAB; => ab
+
+NOT SIGN: &#xAC; => ac
+&#xAC; => ac
+
+SOFT HYPHEN: &#xAD; => ad
+&#xAD; => ad
+
+REGISTERED SIGN: &#xAE; => ae
+&#xAE; => ae
+
+MACRON: &#xAF; => af
+&#xAF; => af
+
+DEGREE SIGN: &#xB0; => b0
+&#xB0; => b0
+
+PLUS-MINUS SIGN: &#xB1; => b1
+&#xB1; => b1
+
+SUPERSCRIPT TWO: &#xB2; => b2
+&#xB2; => b2
+
+SUPERSCRIPT THREE: &#xB3; => b3
+&#xB3; => b3
+
+LATIN CAPITAL LETTER Z WITH CARON: &#x17D; => b4
+&#xB4; => &#xB4;
+
+MICRO SIGN: &#xB5; => b5
+&#xB5; => b5
+
+PILCROW SIGN: &#xB6; => b6
+&#xB6; => b6
+
+MIDDLE DOT: &#xB7; => b7
+&#xB7; => b7
+
+LATIN SMALL LETTER Z WITH CARON: &#x17E; => b8
+&#xB8; => &#xB8;
+
+SUPERSCRIPT ONE: &#xB9; => b9
+&#xB9; => b9
+
+MASCULINE ORDINAL INDICATOR: &#xBA; => ba
+&#xBA; => ba
+
+RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK: &#xBB; => bb
+&#xBB; => bb
+
+LATIN CAPITAL LIGATURE OE: &#x152; => bc
+&#xBC; => &#xBC;
+
+LATIN SMALL LIGATURE OE: &#x153; => bd
+&#xBD; => &#xBD;
+
+LATIN CAPITAL LETTER Y WITH DIAERESIS: &#x178; => be
+&#xBE; => &#xBE;
+
+INVERTED QUESTION MARK: &#xBF; => bf
+&#xBF; => bf
+
+LATIN CAPITAL LETTER A WITH GRAVE: &#xC0; => c0
+&#xC0; => c0
+
+LATIN CAPITAL LETTER A WITH ACUTE: &#xC1; => c1
+&#xC1; => c1
+
+LATIN CAPITAL LETTER A WITH CIRCUMFLEX: &#xC2; => c2
+&#xC2; => c2
+
+LATIN CAPITAL LETTER A WITH TILDE: &#xC3; => c3
+&#xC3; => c3
+
+LATIN CAPITAL LETTER A WITH DIAERESIS: &#xC4; => c4
+&#xC4; => c4
+
+LATIN CAPITAL LETTER A WITH RING ABOVE: &#xC5; => c5
+&#xC5; => c5
+
+LATIN CAPITAL LETTER AE: &#xC6; => c6
+&#xC6; => c6
+
+LATIN CAPITAL LETTER C WITH CEDILLA: &#xC7; => c7
+&#xC7; => c7
+
+LATIN CAPITAL LETTER E WITH GRAVE: &#xC8; => c8
+&#xC8; => c8
+
+LATIN CAPITAL LETTER E WITH ACUTE: &#xC9; => c9
+&#xC9; => c9
+
+LATIN CAPITAL LETTER E WITH CIRCUMFLEX: &#xCA; => ca
+&#xCA; => ca
+
+LATIN CAPITAL LETTER E WITH DIAERESIS: &#xCB; => cb
+&#xCB; => cb
+
+LATIN CAPITAL LETTER I WITH GRAVE: &#xCC; => cc
+&#xCC; => cc
+
+LATIN CAPITAL LETTER I WITH ACUTE: &#xCD; => cd
+&#xCD; => cd
+
+LATIN CAPITAL LETTER I WITH CIRCUMFLEX: &#xCE; => ce
+&#xCE; => ce
+
+LATIN CAPITAL LETTER I WITH DIAERESIS: &#xCF; => cf
+&#xCF; => cf
+
+LATIN CAPITAL LETTER ETH: &#xD0; => d0
+&#xD0; => d0
+
+LATIN CAPITAL LETTER N WITH TILDE: &#xD1; => d1
+&#xD1; => d1
+
+LATIN CAPITAL LETTER O WITH GRAVE: &#xD2; => d2
+&#xD2; => d2
+
+LATIN CAPITAL LETTER O WITH ACUTE: &#xD3; => d3
+&#xD3; => d3
+
+LATIN CAPITAL LETTER O WITH CIRCUMFLEX: &#xD4; => d4
+&#xD4; => d4
+
+LATIN CAPITAL LETTER O WITH TILDE: &#xD5; => d5
+&#xD5; => d5
+
+LATIN CAPITAL LETTER O WITH DIAERESIS: &#xD6; => d6
+&#xD6; => d6
+
+MULTIPLICATION SIGN: &#xD7; => d7
+&#xD7; => d7
+
+LATIN CAPITAL LETTER O WITH STROKE: &#xD8; => d8
+&#xD8; => d8
+
+LATIN CAPITAL LETTER U WITH GRAVE: &#xD9; => d9
+&#xD9; => d9
+
+LATIN CAPITAL LETTER U WITH ACUTE: &#xDA; => da
+&#xDA; => da
+
+LATIN CAPITAL LETTER U WITH CIRCUMFLEX: &#xDB; => db
+&#xDB; => db
+
+LATIN CAPITAL LETTER U WITH DIAERESIS: &#xDC; => dc
+&#xDC; => dc
+
+LATIN CAPITAL LETTER Y WITH ACUTE: &#xDD; => dd
+&#xDD; => dd
+
+LATIN CAPITAL LETTER THORN: &#xDE; => de
+&#xDE; => de
+
+LATIN SMALL LETTER SHARP S: &#xDF; => df
+&#xDF; => df
+
+LATIN SMALL LETTER A WITH GRAVE: &#xE0; => e0
+&#xE0; => e0
+
+LATIN SMALL LETTER A WITH ACUTE: &#xE1; => e1
+&#xE1; => e1
+
+LATIN SMALL LETTER A WITH CIRCUMFLEX: &#xE2; => e2
+&#xE2; => e2
+
+LATIN SMALL LETTER A WITH TILDE: &#xE3; => e3
+&#xE3; => e3
+
+LATIN SMALL LETTER A WITH DIAERESIS: &#xE4; => e4
+&#xE4; => e4
+
+LATIN SMALL LETTER A WITH RING ABOVE: &#xE5; => e5
+&#xE5; => e5
+
+LATIN SMALL LETTER AE: &#xE6; => e6
+&#xE6; => e6
+
+LATIN SMALL LETTER C WITH CEDILLA: &#xE7; => e7
+&#xE7; => e7
+
+LATIN SMALL LETTER E WITH GRAVE: &#xE8; => e8
+&#xE8; => e8
+
+LATIN SMALL LETTER E WITH ACUTE: &#xE9; => e9
+&#xE9; => e9
+
+LATIN SMALL LETTER E WITH CIRCUMFLEX: &#xEA; => ea
+&#xEA; => ea
+
+LATIN SMALL LETTER E WITH DIAERESIS: &#xEB; => eb
+&#xEB; => eb
+
+LATIN SMALL LETTER I WITH GRAVE: &#xEC; => ec
+&#xEC; => ec
+
+LATIN SMALL LETTER I WITH ACUTE: &#xED; => ed
+&#xED; => ed
+
+LATIN SMALL LETTER I WITH CIRCUMFLEX: &#xEE; => ee
+&#xEE; => ee
+
+LATIN SMALL LETTER I WITH DIAERESIS: &#xEF; => ef
+&#xEF; => ef
+
+LATIN SMALL LETTER ETH: &#xF0; => f0
+&#xF0; => f0
+
+LATIN SMALL LETTER N WITH TILDE: &#xF1; => f1
+&#xF1; => f1
+
+LATIN SMALL LETTER O WITH GRAVE: &#xF2; => f2
+&#xF2; => f2
+
+LATIN SMALL LETTER O WITH ACUTE: &#xF3; => f3
+&#xF3; => f3
+
+LATIN SMALL LETTER O WITH CIRCUMFLEX: &#xF4; => f4
+&#xF4; => f4
+
+LATIN SMALL LETTER O WITH TILDE: &#xF5; => f5
+&#xF5; => f5
+
+LATIN SMALL LETTER O WITH DIAERESIS: &#xF6; => f6
+&#xF6; => f6
+
+DIVISION SIGN: &#xF7; => f7
+&#xF7; => f7
+
+LATIN SMALL LETTER O WITH STROKE: &#xF8; => f8
+&#xF8; => f8
+
+LATIN SMALL LETTER U WITH GRAVE: &#xF9; => f9
+&#xF9; => f9
+
+LATIN SMALL LETTER U WITH ACUTE: &#xFA; => fa
+&#xFA; => fa
+
+LATIN SMALL LETTER U WITH CIRCUMFLEX: &#xFB; => fb
+&#xFB; => fb
+
+LATIN SMALL LETTER U WITH DIAERESIS: &#xFC; => fc
+&#xFC; => fc
+
+LATIN SMALL LETTER Y WITH ACUTE: &#xFD; => fd
+&#xFD; => fd
+
+LATIN SMALL LETTER THORN: &#xFE; => fe
+&#xFE; => fe
+
+LATIN SMALL LETTER Y WITH DIAERESIS: &#xFF; => ff
+&#xFF; => ff
+
+
diff --git a/ext/standard/tests/strings/html_entity_decode_iso8859-5.phpt b/ext/standard/tests/strings/html_entity_decode_iso8859-5.phpt
new file mode 100644
index 0000000000..6a65413c9c
--- /dev/null
+++ b/ext/standard/tests/strings/html_entity_decode_iso8859-5.phpt
@@ -0,0 +1,405 @@
+--TEST--
+Translation of HTML entities for encoding ISO-8859-5
+--FILE--
+<?php
+$arr = array( // Unicode code point => array(byte value in ISO-8859-5, Unicode character name)
+0x00A0 => array(0xA0, "NO-BREAK SPACE"),
+0x0401 => array(0xA1, "CYRILLIC CAPITAL LETTER IO"),
+0x0402 => array(0xA2, "CYRILLIC CAPITAL LETTER DJE"),
+0x0403 => array(0xA3, "CYRILLIC CAPITAL LETTER GJE"),
+0x0404 => array(0xA4, "CYRILLIC CAPITAL LETTER UKRAINIAN IE"),
+0x0405 => array(0xA5, "CYRILLIC CAPITAL LETTER DZE"),
+0x0406 => array(0xA6, "CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I"),
+0x0407 => array(0xA7, "CYRILLIC CAPITAL LETTER YI"),
+0x0408 => array(0xA8, "CYRILLIC CAPITAL LETTER JE"),
+0x0409 => array(0xA9, "CYRILLIC CAPITAL LETTER LJE"),
+0x040A => array(0xAA, "CYRILLIC CAPITAL LETTER NJE"),
+0x040B => array(0xAB, "CYRILLIC CAPITAL LETTER TSHE"),
+0x040C => array(0xAC, "CYRILLIC CAPITAL LETTER KJE"),
+0x00AD => array(0xAD, "SOFT HYPHEN"),
+0x040E => array(0xAE, "CYRILLIC CAPITAL LETTER SHORT U"),
+0x040F => array(0xAF, "CYRILLIC CAPITAL LETTER DZHE"),
+0x0410 => array(0xB0, "CYRILLIC CAPITAL LETTER A"),
+0x0411 => array(0xB1, "CYRILLIC CAPITAL LETTER BE"),
+0x0412 => array(0xB2, "CYRILLIC CAPITAL LETTER VE"),
+0x0413 => array(0xB3, "CYRILLIC CAPITAL LETTER GHE"),
+0x0414 => array(0xB4, "CYRILLIC CAPITAL LETTER DE"),
+0x0415 => array(0xB5, "CYRILLIC CAPITAL LETTER IE"),
+0x0416 => array(0xB6, "CYRILLIC CAPITAL LETTER ZHE"),
+0x0417 => array(0xB7, "CYRILLIC CAPITAL LETTER ZE"),
+0x0418 => array(0xB8, "CYRILLIC CAPITAL LETTER I"),
+0x0419 => array(0xB9, "CYRILLIC CAPITAL LETTER SHORT I"),
+0x041A => array(0xBA, "CYRILLIC CAPITAL LETTER KA"),
+0x041B => array(0xBB, "CYRILLIC CAPITAL LETTER EL"),
+0x041C => array(0xBC, "CYRILLIC CAPITAL LETTER EM"),
+0x041D => array(0xBD, "CYRILLIC CAPITAL LETTER EN"),
+0x041E => array(0xBE, "CYRILLIC CAPITAL LETTER O"),
+0x041F => array(0xBF, "CYRILLIC CAPITAL LETTER PE"),
+0x0420 => array(0xC0, "CYRILLIC CAPITAL LETTER ER"),
+0x0421 => array(0xC1, "CYRILLIC CAPITAL LETTER ES"),
+0x0422 => array(0xC2, "CYRILLIC CAPITAL LETTER TE"),
+0x0423 => array(0xC3, "CYRILLIC CAPITAL LETTER U"),
+0x0424 => array(0xC4, "CYRILLIC CAPITAL LETTER EF"),
+0x0425 => array(0xC5, "CYRILLIC CAPITAL LETTER HA"),
+0x0426 => array(0xC6, "CYRILLIC CAPITAL LETTER TSE"),
+0x0427 => array(0xC7, "CYRILLIC CAPITAL LETTER CHE"),
+0x0428 => array(0xC8, "CYRILLIC CAPITAL LETTER SHA"),
+0x0429 => array(0xC9, "CYRILLIC CAPITAL LETTER SHCHA"),
+0x042A => array(0xCA, "CYRILLIC CAPITAL LETTER HARD SIGN"),
+0x042B => array(0xCB, "CYRILLIC CAPITAL LETTER YERU"),
+0x042C => array(0xCC, "CYRILLIC CAPITAL LETTER SOFT SIGN"),
+0x042D => array(0xCD, "CYRILLIC CAPITAL LETTER E"),
+0x042E => array(0xCE, "CYRILLIC CAPITAL LETTER YU"),
+0x042F => array(0xCF, "CYRILLIC CAPITAL LETTER YA"),
+0x0430 => array(0xD0, "CYRILLIC SMALL LETTER A"),
+0x0431 => array(0xD1, "CYRILLIC SMALL LETTER BE"),
+0x0432 => array(0xD2, "CYRILLIC SMALL LETTER VE"),
+0x0433 => array(0xD3, "CYRILLIC SMALL LETTER GHE"),
+0x0434 => array(0xD4, "CYRILLIC SMALL LETTER DE"),
+0x0435 => array(0xD5, "CYRILLIC SMALL LETTER IE"),
+0x0436 => array(0xD6, "CYRILLIC SMALL LETTER ZHE"),
+0x0437 => array(0xD7, "CYRILLIC SMALL LETTER ZE"),
+0x0438 => array(0xD8, "CYRILLIC SMALL LETTER I"),
+0x0439 => array(0xD9, "CYRILLIC SMALL LETTER SHORT I"),
+0x043A => array(0xDA, "CYRILLIC SMALL LETTER KA"),
+0x043B => array(0xDB, "CYRILLIC SMALL LETTER EL"),
+0x043C => array(0xDC, "CYRILLIC SMALL LETTER EM"),
+0x043D => array(0xDD, "CYRILLIC SMALL LETTER EN"),
+0x043E => array(0xDE, "CYRILLIC SMALL LETTER O"),
+0x043F => array(0xDF, "CYRILLIC SMALL LETTER PE"),
+0x0440 => array(0xE0, "CYRILLIC SMALL LETTER ER"),
+0x0441 => array(0xE1, "CYRILLIC SMALL LETTER ES"),
+0x0442 => array(0xE2, "CYRILLIC SMALL LETTER TE"),
+0x0443 => array(0xE3, "CYRILLIC SMALL LETTER U"),
+0x0444 => array(0xE4, "CYRILLIC SMALL LETTER EF"),
+0x0445 => array(0xE5, "CYRILLIC SMALL LETTER HA"),
+0x0446 => array(0xE6, "CYRILLIC SMALL LETTER TSE"),
+0x0447 => array(0xE7, "CYRILLIC SMALL LETTER CHE"),
+0x0448 => array(0xE8, "CYRILLIC SMALL LETTER SHA"),
+0x0449 => array(0xE9, "CYRILLIC SMALL LETTER SHCHA"),
+0x044A => array(0xEA, "CYRILLIC SMALL LETTER HARD SIGN"),
+0x044B => array(0xEB, "CYRILLIC SMALL LETTER YERU"),
+0x044C => array(0xEC, "CYRILLIC SMALL LETTER SOFT SIGN"),
+0x044D => array(0xED, "CYRILLIC SMALL LETTER E"),
+0x044E => array(0xEE, "CYRILLIC SMALL LETTER YU"),
+0x044F => array(0xEF, "CYRILLIC SMALL LETTER YA"),
+0x2116 => array(0xF0, "NUMERO SIGN"),
+0x0451 => array(0xF1, "CYRILLIC SMALL LETTER IO"),
+0x0452 => array(0xF2, "CYRILLIC SMALL LETTER DJE"),
+0x0453 => array(0xF3, "CYRILLIC SMALL LETTER GJE"),
+0x0454 => array(0xF4, "CYRILLIC SMALL LETTER UKRAINIAN IE"),
+0x0455 => array(0xF5, "CYRILLIC SMALL LETTER DZE"),
+0x0456 => array(0xF6, "CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I"),
+0x0457 => array(0xF7, "CYRILLIC SMALL LETTER YI"),
+0x0458 => array(0xF8, "CYRILLIC SMALL LETTER JE"),
+0x0459 => array(0xF9, "CYRILLIC SMALL LETTER LJE"),
+0x045A => array(0xFA, "CYRILLIC SMALL LETTER NJE"),
+0x045B => array(0xFB, "CYRILLIC SMALL LETTER TSHE"),
+0x045C => array(0xFC, "CYRILLIC SMALL LETTER KJE"),
+0x00A7 => array(0xFD, "SECTION SIGN"),
+0x045E => array(0xFE, "CYRILLIC SMALL LETTER SHORT U"),
+0x045F => array(0xFF, "CYRILLIC SMALL LETTER DZHE"),
+);
+
+foreach ($arr as $u => $v) { // $u: table key (code point); $v[0]: ISO-8859-5 byte value; $v[1]: character name
+ $ent = sprintf("&#x%X;", $u); // hexadecimal numeric entity built from the table key
+ $res = html_entity_decode($ent, ENT_QUOTES, 'ISO-8859-5');
+ $d = unpack("H*", $res); // hex dump of whatever the decoder returned (an undecoded entity shows as 2623...3b)
+ echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]); // NOTE(review): EXPECT has &#x451;..&#x45F; undecoded although the table maps them to 0xF1..0xFF - confirm
+
+ $ent = sprintf("&#x%X;", $v[0]); // second probe: the byte value itself written as a numeric entity
+ $res = html_entity_decode($ent, ENT_QUOTES, 'ISO-8859-5');
+ if ($res[0] != "&" || $res[1] != "#") // a result still starting with "&#" is printed verbatim below
+ $res = unpack("H*", $res)[1]; // otherwise show the decoded byte(s) as hex
+ echo sprintf("%s => %s\n\n", $ent, $res);
+}
+--EXPECT--
+NO-BREAK SPACE: &#xA0; => a0
+&#xA0; => a0
+
+CYRILLIC CAPITAL LETTER IO: &#x401; => a1
+&#xA1; => &#xA1;
+
+CYRILLIC CAPITAL LETTER DJE: &#x402; => a2
+&#xA2; => &#xA2;
+
+CYRILLIC CAPITAL LETTER GJE: &#x403; => a3
+&#xA3; => &#xA3;
+
+CYRILLIC CAPITAL LETTER UKRAINIAN IE: &#x404; => a4
+&#xA4; => &#xA4;
+
+CYRILLIC CAPITAL LETTER DZE: &#x405; => a5
+&#xA5; => &#xA5;
+
+CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I: &#x406; => a6
+&#xA6; => &#xA6;
+
+CYRILLIC CAPITAL LETTER YI: &#x407; => a7
+&#xA7; => fd
+
+CYRILLIC CAPITAL LETTER JE: &#x408; => a8
+&#xA8; => &#xA8;
+
+CYRILLIC CAPITAL LETTER LJE: &#x409; => a9
+&#xA9; => &#xA9;
+
+CYRILLIC CAPITAL LETTER NJE: &#x40A; => aa
+&#xAA; => &#xAA;
+
+CYRILLIC CAPITAL LETTER TSHE: &#x40B; => ab
+&#xAB; => &#xAB;
+
+CYRILLIC CAPITAL LETTER KJE: &#x40C; => ac
+&#xAC; => &#xAC;
+
+SOFT HYPHEN: &#xAD; => ad
+&#xAD; => ad
+
+CYRILLIC CAPITAL LETTER SHORT U: &#x40E; => ae
+&#xAE; => &#xAE;
+
+CYRILLIC CAPITAL LETTER DZHE: &#x40F; => af
+&#xAF; => &#xAF;
+
+CYRILLIC CAPITAL LETTER A: &#x410; => b0
+&#xB0; => &#xB0;
+
+CYRILLIC CAPITAL LETTER BE: &#x411; => b1
+&#xB1; => &#xB1;
+
+CYRILLIC CAPITAL LETTER VE: &#x412; => b2
+&#xB2; => &#xB2;
+
+CYRILLIC CAPITAL LETTER GHE: &#x413; => b3
+&#xB3; => &#xB3;
+
+CYRILLIC CAPITAL LETTER DE: &#x414; => b4
+&#xB4; => &#xB4;
+
+CYRILLIC CAPITAL LETTER IE: &#x415; => b5
+&#xB5; => &#xB5;
+
+CYRILLIC CAPITAL LETTER ZHE: &#x416; => b6
+&#xB6; => &#xB6;
+
+CYRILLIC CAPITAL LETTER ZE: &#x417; => b7
+&#xB7; => &#xB7;
+
+CYRILLIC CAPITAL LETTER I: &#x418; => b8
+&#xB8; => &#xB8;
+
+CYRILLIC CAPITAL LETTER SHORT I: &#x419; => b9
+&#xB9; => &#xB9;
+
+CYRILLIC CAPITAL LETTER KA: &#x41A; => ba
+&#xBA; => &#xBA;
+
+CYRILLIC CAPITAL LETTER EL: &#x41B; => bb
+&#xBB; => &#xBB;
+
+CYRILLIC CAPITAL LETTER EM: &#x41C; => bc
+&#xBC; => &#xBC;
+
+CYRILLIC CAPITAL LETTER EN: &#x41D; => bd
+&#xBD; => &#xBD;
+
+CYRILLIC CAPITAL LETTER O: &#x41E; => be
+&#xBE; => &#xBE;
+
+CYRILLIC CAPITAL LETTER PE: &#x41F; => bf
+&#xBF; => &#xBF;
+
+CYRILLIC CAPITAL LETTER ER: &#x420; => c0
+&#xC0; => &#xC0;
+
+CYRILLIC CAPITAL LETTER ES: &#x421; => c1
+&#xC1; => &#xC1;
+
+CYRILLIC CAPITAL LETTER TE: &#x422; => c2
+&#xC2; => &#xC2;
+
+CYRILLIC CAPITAL LETTER U: &#x423; => c3
+&#xC3; => &#xC3;
+
+CYRILLIC CAPITAL LETTER EF: &#x424; => c4
+&#xC4; => &#xC4;
+
+CYRILLIC CAPITAL LETTER HA: &#x425; => c5
+&#xC5; => &#xC5;
+
+CYRILLIC CAPITAL LETTER TSE: &#x426; => c6
+&#xC6; => &#xC6;
+
+CYRILLIC CAPITAL LETTER CHE: &#x427; => c7
+&#xC7; => &#xC7;
+
+CYRILLIC CAPITAL LETTER SHA: &#x428; => c8
+&#xC8; => &#xC8;
+
+CYRILLIC CAPITAL LETTER SHCHA: &#x429; => c9
+&#xC9; => &#xC9;
+
+CYRILLIC CAPITAL LETTER HARD SIGN: &#x42A; => ca
+&#xCA; => &#xCA;
+
+CYRILLIC CAPITAL LETTER YERU: &#x42B; => cb
+&#xCB; => &#xCB;
+
+CYRILLIC CAPITAL LETTER SOFT SIGN: &#x42C; => cc
+&#xCC; => &#xCC;
+
+CYRILLIC CAPITAL LETTER E: &#x42D; => cd
+&#xCD; => &#xCD;
+
+CYRILLIC CAPITAL LETTER YU: &#x42E; => ce
+&#xCE; => &#xCE;
+
+CYRILLIC CAPITAL LETTER YA: &#x42F; => cf
+&#xCF; => &#xCF;
+
+CYRILLIC SMALL LETTER A: &#x430; => d0
+&#xD0; => &#xD0;
+
+CYRILLIC SMALL LETTER BE: &#x431; => d1
+&#xD1; => &#xD1;
+
+CYRILLIC SMALL LETTER VE: &#x432; => d2
+&#xD2; => &#xD2;
+
+CYRILLIC SMALL LETTER GHE: &#x433; => d3
+&#xD3; => &#xD3;
+
+CYRILLIC SMALL LETTER DE: &#x434; => d4
+&#xD4; => &#xD4;
+
+CYRILLIC SMALL LETTER IE: &#x435; => d5
+&#xD5; => &#xD5;
+
+CYRILLIC SMALL LETTER ZHE: &#x436; => d6
+&#xD6; => &#xD6;
+
+CYRILLIC SMALL LETTER ZE: &#x437; => d7
+&#xD7; => &#xD7;
+
+CYRILLIC SMALL LETTER I: &#x438; => d8
+&#xD8; => &#xD8;
+
+CYRILLIC SMALL LETTER SHORT I: &#x439; => d9
+&#xD9; => &#xD9;
+
+CYRILLIC SMALL LETTER KA: &#x43A; => da
+&#xDA; => &#xDA;
+
+CYRILLIC SMALL LETTER EL: &#x43B; => db
+&#xDB; => &#xDB;
+
+CYRILLIC SMALL LETTER EM: &#x43C; => dc
+&#xDC; => &#xDC;
+
+CYRILLIC SMALL LETTER EN: &#x43D; => dd
+&#xDD; => &#xDD;
+
+CYRILLIC SMALL LETTER O: &#x43E; => de
+&#xDE; => &#xDE;
+
+CYRILLIC SMALL LETTER PE: &#x43F; => df
+&#xDF; => &#xDF;
+
+CYRILLIC SMALL LETTER ER: &#x440; => e0
+&#xE0; => &#xE0;
+
+CYRILLIC SMALL LETTER ES: &#x441; => e1
+&#xE1; => &#xE1;
+
+CYRILLIC SMALL LETTER TE: &#x442; => e2
+&#xE2; => &#xE2;
+
+CYRILLIC SMALL LETTER U: &#x443; => e3
+&#xE3; => &#xE3;
+
+CYRILLIC SMALL LETTER EF: &#x444; => e4
+&#xE4; => &#xE4;
+
+CYRILLIC SMALL LETTER HA: &#x445; => e5
+&#xE5; => &#xE5;
+
+CYRILLIC SMALL LETTER TSE: &#x446; => e6
+&#xE6; => &#xE6;
+
+CYRILLIC SMALL LETTER CHE: &#x447; => e7
+&#xE7; => &#xE7;
+
+CYRILLIC SMALL LETTER SHA: &#x448; => e8
+&#xE8; => &#xE8;
+
+CYRILLIC SMALL LETTER SHCHA: &#x449; => e9
+&#xE9; => &#xE9;
+
+CYRILLIC SMALL LETTER HARD SIGN: &#x44A; => ea
+&#xEA; => &#xEA;
+
+CYRILLIC SMALL LETTER YERU: &#x44B; => eb
+&#xEB; => &#xEB;
+
+CYRILLIC SMALL LETTER SOFT SIGN: &#x44C; => ec
+&#xEC; => &#xEC;
+
+CYRILLIC SMALL LETTER E: &#x44D; => ed
+&#xED; => &#xED;
+
+CYRILLIC SMALL LETTER YU: &#x44E; => ee
+&#xEE; => &#xEE;
+
+CYRILLIC SMALL LETTER YA: &#x44F; => ef
+&#xEF; => &#xEF;
+
+NUMERO SIGN: &#x2116; => f0
+&#xF0; => &#xF0;
+
+CYRILLIC SMALL LETTER IO: &#x451; => 2623783435313b
+&#xF1; => &#xF1;
+
+CYRILLIC SMALL LETTER DJE: &#x452; => 2623783435323b
+&#xF2; => &#xF2;
+
+CYRILLIC SMALL LETTER GJE: &#x453; => 2623783435333b
+&#xF3; => &#xF3;
+
+CYRILLIC SMALL LETTER UKRAINIAN IE: &#x454; => 2623783435343b
+&#xF4; => &#xF4;
+
+CYRILLIC SMALL LETTER DZE: &#x455; => 2623783435353b
+&#xF5; => &#xF5;
+
+CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I: &#x456; => 2623783435363b
+&#xF6; => &#xF6;
+
+CYRILLIC SMALL LETTER YI: &#x457; => 2623783435373b
+&#xF7; => &#xF7;
+
+CYRILLIC SMALL LETTER JE: &#x458; => 2623783435383b
+&#xF8; => &#xF8;
+
+CYRILLIC SMALL LETTER LJE: &#x459; => 2623783435393b
+&#xF9; => &#xF9;
+
+CYRILLIC SMALL LETTER NJE: &#x45A; => 2623783435413b
+&#xFA; => &#xFA;
+
+CYRILLIC SMALL LETTER TSHE: &#x45B; => 2623783435423b
+&#xFB; => &#xFB;
+
+CYRILLIC SMALL LETTER KJE: &#x45C; => 2623783435433b
+&#xFC; => &#xFC;
+
+SECTION SIGN: &#xA7; => fd
+&#xFD; => &#xFD;
+
+CYRILLIC SMALL LETTER SHORT U: &#x45E; => 2623783435453b
+&#xFE; => &#xFE;
+
+CYRILLIC SMALL LETTER DZHE: &#x45F; => 2623783435463b
+&#xFF; => &#xFF;
+
+
diff --git a/ext/standard/tests/strings/html_entity_decode_koi8-r.phpt b/ext/standard/tests/strings/html_entity_decode_koi8-r.phpt
new file mode 100644
index 0000000000..cb7fc7d1d8
--- /dev/null
+++ b/ext/standard/tests/strings/html_entity_decode_koi8-r.phpt
@@ -0,0 +1,533 @@
+--TEST--
+Translation of HTML entities for encoding KOI8-R
+--FILE--
+<?php
+$arr = array(
+0x2500 => array(0x80, "BOX DRAWINGS LIGHT HORIZONTAL"),
+0x2502 => array(0x81, "BOX DRAWINGS LIGHT VERTICAL"),
+0x250C => array(0x82, "BOX DRAWINGS LIGHT DOWN AND RIGHT"),
+0x2510 => array(0x83, "BOX DRAWINGS LIGHT DOWN AND LEFT"),
+0x2514 => array(0x84, "BOX DRAWINGS LIGHT UP AND RIGHT"),
+0x2518 => array(0x85, "BOX DRAWINGS LIGHT UP AND LEFT"),
+0x251C => array(0x86, "BOX DRAWINGS LIGHT VERTICAL AND RIGHT"),
+0x2524 => array(0x87, "BOX DRAWINGS LIGHT VERTICAL AND LEFT"),
+0x252C => array(0x88, "BOX DRAWINGS LIGHT DOWN AND HORIZONTAL"),
+0x2534 => array(0x89, "BOX DRAWINGS LIGHT UP AND HORIZONTAL"),
+0x253C => array(0x8A, "BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL"),
+0x2580 => array(0x8B, "UPPER HALF BLOCK"),
+0x2584 => array(0x8C, "LOWER HALF BLOCK"),
+0x2588 => array(0x8D, "FULL BLOCK"),
+0x258C => array(0x8E, "LEFT HALF BLOCK"),
+0x2590 => array(0x8F, "RIGHT HALF BLOCK"),
+0x2591 => array(0x90, "LIGHT SHADE"),
+0x2592 => array(0x91, "MEDIUM SHADE"),
+0x2593 => array(0x92, "DARK SHADE"),
+0x2320 => array(0x93, "TOP HALF INTEGRAL"),
+0x25A0 => array(0x94, "BLACK SQUARE"),
+0x2219 => array(0x95, "BULLET OPERATOR"),
+0x221A => array(0x96, "SQUARE ROOT"),
+0x2248 => array(0x97, "ALMOST EQUAL TO"),
+0x2264 => array(0x98, "LESS-THAN OR EQUAL TO"),
+0x2265 => array(0x99, "GREATER-THAN OR EQUAL TO"),
+0x00A0 => array(0x9A, "NO-BREAK SPACE"),
+0x2321 => array(0x9B, "BOTTOM HALF INTEGRAL"),
+0x00B0 => array(0x9C, "DEGREE SIGN"),
+0x00B2 => array(0x9D, "SUPERSCRIPT TWO"),
+0x00B7 => array(0x9E, "MIDDLE DOT"),
+0x00F7 => array(0x9F, "DIVISION SIGN"),
+0x2550 => array(0xA0, "BOX DRAWINGS DOUBLE HORIZONTAL"),
+0x2551 => array(0xA1, "BOX DRAWINGS DOUBLE VERTICAL"),
+0x2552 => array(0xA2, "BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE"),
+0x0451 => array(0xA3, "CYRILLIC SMALL LETTER IO"),
+0x2553 => array(0xA4, "BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE"),
+0x2554 => array(0xA5, "BOX DRAWINGS DOUBLE DOWN AND RIGHT"),
+0x2555 => array(0xA6, "BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE"),
+0x2556 => array(0xA7, "BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE"),
+0x2557 => array(0xA8, "BOX DRAWINGS DOUBLE DOWN AND LEFT"),
+0x2558 => array(0xA9, "BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE"),
+0x2559 => array(0xAA, "BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE"),
+0x255A => array(0xAB, "BOX DRAWINGS DOUBLE UP AND RIGHT"),
+0x255B => array(0xAC, "BOX DRAWINGS UP SINGLE AND LEFT DOUBLE"),
+0x255C => array(0xAD, "BOX DRAWINGS UP DOUBLE AND LEFT SINGLE"),
+0x255D => array(0xAE, "BOX DRAWINGS DOUBLE UP AND LEFT"),
+0x255E => array(0xAF, "BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE"),
+0x255F => array(0xB0, "BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE"),
+0x2560 => array(0xB1, "BOX DRAWINGS DOUBLE VERTICAL AND RIGHT"),
+0x2561 => array(0xB2, "BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE"),
+0x0401 => array(0xB3, "CYRILLIC CAPITAL LETTER IO"),
+0x2562 => array(0xB4, "BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE"),
+0x2563 => array(0xB5, "BOX DRAWINGS DOUBLE VERTICAL AND LEFT"),
+0x2564 => array(0xB6, "BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE"),
+0x2565 => array(0xB7, "BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE"),
+0x2566 => array(0xB8, "BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL"),
+0x2567 => array(0xB9, "BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE"),
+0x2568 => array(0xBA, "BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE"),
+0x2569 => array(0xBB, "BOX DRAWINGS DOUBLE UP AND HORIZONTAL"),
+0x256A => array(0xBC, "BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE"),
+0x256B => array(0xBD, "BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE"),
+0x256C => array(0xBE, "BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL"),
+0x00A9 => array(0xBF, "COPYRIGHT SIGN"),
+0x044E => array(0xC0, "CYRILLIC SMALL LETTER YU"),
+0x0430 => array(0xC1, "CYRILLIC SMALL LETTER A"),
+0x0431 => array(0xC2, "CYRILLIC SMALL LETTER BE"),
+0x0446 => array(0xC3, "CYRILLIC SMALL LETTER TSE"),
+0x0434 => array(0xC4, "CYRILLIC SMALL LETTER DE"),
+0x0435 => array(0xC5, "CYRILLIC SMALL LETTER IE"),
+0x0444 => array(0xC6, "CYRILLIC SMALL LETTER EF"),
+0x0433 => array(0xC7, "CYRILLIC SMALL LETTER GHE"),
+0x0445 => array(0xC8, "CYRILLIC SMALL LETTER HA"),
+0x0438 => array(0xC9, "CYRILLIC SMALL LETTER I"),
+0x0439 => array(0xCA, "CYRILLIC SMALL LETTER SHORT I"),
+0x043A => array(0xCB, "CYRILLIC SMALL LETTER KA"),
+0x043B => array(0xCC, "CYRILLIC SMALL LETTER EL"),
+0x043C => array(0xCD, "CYRILLIC SMALL LETTER EM"),
+0x043D => array(0xCE, "CYRILLIC SMALL LETTER EN"),
+0x043E => array(0xCF, "CYRILLIC SMALL LETTER O"),
+0x043F => array(0xD0, "CYRILLIC SMALL LETTER PE"),
+0x044F => array(0xD1, "CYRILLIC SMALL LETTER YA"),
+0x0440 => array(0xD2, "CYRILLIC SMALL LETTER ER"),
+0x0441 => array(0xD3, "CYRILLIC SMALL LETTER ES"),
+0x0442 => array(0xD4, "CYRILLIC SMALL LETTER TE"),
+0x0443 => array(0xD5, "CYRILLIC SMALL LETTER U"),
+0x0436 => array(0xD6, "CYRILLIC SMALL LETTER ZHE"),
+0x0432 => array(0xD7, "CYRILLIC SMALL LETTER VE"),
+0x044C => array(0xD8, "CYRILLIC SMALL LETTER SOFT SIGN"),
+0x044B => array(0xD9, "CYRILLIC SMALL LETTER YERU"),
+0x0437 => array(0xDA, "CYRILLIC SMALL LETTER ZE"),
+0x0448 => array(0xDB, "CYRILLIC SMALL LETTER SHA"),
+0x044D => array(0xDC, "CYRILLIC SMALL LETTER E"),
+0x0449 => array(0xDD, "CYRILLIC SMALL LETTER SHCHA"),
+0x0447 => array(0xDE, "CYRILLIC SMALL LETTER CHE"),
+0x044A => array(0xDF, "CYRILLIC SMALL LETTER HARD SIGN"),
+0x042E => array(0xE0, "CYRILLIC CAPITAL LETTER YU"),
+0x0410 => array(0xE1, "CYRILLIC CAPITAL LETTER A"),
+0x0411 => array(0xE2, "CYRILLIC CAPITAL LETTER BE"),
+0x0426 => array(0xE3, "CYRILLIC CAPITAL LETTER TSE"),
+0x0414 => array(0xE4, "CYRILLIC CAPITAL LETTER DE"),
+0x0415 => array(0xE5, "CYRILLIC CAPITAL LETTER IE"),
+0x0424 => array(0xE6, "CYRILLIC CAPITAL LETTER EF"),
+0x0413 => array(0xE7, "CYRILLIC CAPITAL LETTER GHE"),
+0x0425 => array(0xE8, "CYRILLIC CAPITAL LETTER HA"),
+0x0418 => array(0xE9, "CYRILLIC CAPITAL LETTER I"),
+0x0419 => array(0xEA, "CYRILLIC CAPITAL LETTER SHORT I"),
+0x041A => array(0xEB, "CYRILLIC CAPITAL LETTER KA"),
+0x041B => array(0xEC, "CYRILLIC CAPITAL LETTER EL"),
+0x041C => array(0xED, "CYRILLIC CAPITAL LETTER EM"),
+0x041D => array(0xEE, "CYRILLIC CAPITAL LETTER EN"),
+0x041E => array(0xEF, "CYRILLIC CAPITAL LETTER O"),
+0x041F => array(0xF0, "CYRILLIC CAPITAL LETTER PE"),
+0x042F => array(0xF1, "CYRILLIC CAPITAL LETTER YA"),
+0x0420 => array(0xF2, "CYRILLIC CAPITAL LETTER ER"),
+0x0421 => array(0xF3, "CYRILLIC CAPITAL LETTER ES"),
+0x0422 => array(0xF4, "CYRILLIC CAPITAL LETTER TE"),
+0x0423 => array(0xF5, "CYRILLIC CAPITAL LETTER U"),
+0x0416 => array(0xF6, "CYRILLIC CAPITAL LETTER ZHE"),
+0x0412 => array(0xF7, "CYRILLIC CAPITAL LETTER VE"),
+0x042C => array(0xF8, "CYRILLIC CAPITAL LETTER SOFT SIGN"),
+0x042B => array(0xF9, "CYRILLIC CAPITAL LETTER YERU"),
+0x0417 => array(0xFA, "CYRILLIC CAPITAL LETTER ZE"),
+0x0428 => array(0xFB, "CYRILLIC CAPITAL LETTER SHA"),
+0x042D => array(0xFC, "CYRILLIC CAPITAL LETTER E"),
+0x0429 => array(0xFD, "CYRILLIC CAPITAL LETTER SHCHA"),
+0x0427 => array(0xFE, "CYRILLIC CAPITAL LETTER CHE"),
+0x042A => array(0xFF, "CYRILLIC CAPITAL LETTER HARD SIGN"),
+);
+
+foreach ($arr as $u => $v) {
+ $ent = sprintf("&#x%X;", $u);
+ $res = html_entity_decode($ent, ENT_QUOTES, 'KOI8-R');
+ $d = unpack("H*", $res);
+ echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]);
+
+ $ent = sprintf("&#x%X;", $v[0]);
+ $res = html_entity_decode($ent, ENT_QUOTES, 'KOI8-R');
+ if ($res[0] != "&" || $res[1] != "#")
+ $res = unpack("H*", $res)[1];
+ echo sprintf("%s => %s\n\n", $ent, $res);
+}
+--EXPECT--
+BOX DRAWINGS LIGHT HORIZONTAL: &#x2500; => 80
+&#x80; => &#x80;
+
+BOX DRAWINGS LIGHT VERTICAL: &#x2502; => 81
+&#x81; => &#x81;
+
+BOX DRAWINGS LIGHT DOWN AND RIGHT: &#x250C; => 82
+&#x82; => &#x82;
+
+BOX DRAWINGS LIGHT DOWN AND LEFT: &#x2510; => 83
+&#x83; => &#x83;
+
+BOX DRAWINGS LIGHT UP AND RIGHT: &#x2514; => 84
+&#x84; => &#x84;
+
+BOX DRAWINGS LIGHT UP AND LEFT: &#x2518; => 85
+&#x85; => &#x85;
+
+BOX DRAWINGS LIGHT VERTICAL AND RIGHT: &#x251C; => 86
+&#x86; => &#x86;
+
+BOX DRAWINGS LIGHT VERTICAL AND LEFT: &#x2524; => 87
+&#x87; => &#x87;
+
+BOX DRAWINGS LIGHT DOWN AND HORIZONTAL: &#x252C; => 88
+&#x88; => &#x88;
+
+BOX DRAWINGS LIGHT UP AND HORIZONTAL: &#x2534; => 89
+&#x89; => &#x89;
+
+BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL: &#x253C; => 8a
+&#x8A; => &#x8A;
+
+UPPER HALF BLOCK: &#x2580; => 8b
+&#x8B; => &#x8B;
+
+LOWER HALF BLOCK: &#x2584; => 8c
+&#x8C; => &#x8C;
+
+FULL BLOCK: &#x2588; => 8d
+&#x8D; => &#x8D;
+
+LEFT HALF BLOCK: &#x258C; => 8e
+&#x8E; => &#x8E;
+
+RIGHT HALF BLOCK: &#x2590; => 8f
+&#x8F; => &#x8F;
+
+LIGHT SHADE: &#x2591; => 90
+&#x90; => &#x90;
+
+MEDIUM SHADE: &#x2592; => 91
+&#x91; => &#x91;
+
+DARK SHADE: &#x2593; => 92
+&#x92; => &#x92;
+
+TOP HALF INTEGRAL: &#x2320; => 93
+&#x93; => &#x93;
+
+BLACK SQUARE: &#x25A0; => 94
+&#x94; => &#x94;
+
+BULLET OPERATOR: &#x2219; => 95
+&#x95; => &#x95;
+
+SQUARE ROOT: &#x221A; => 96
+&#x96; => &#x96;
+
+ALMOST EQUAL TO: &#x2248; => 97
+&#x97; => &#x97;
+
+LESS-THAN OR EQUAL TO: &#x2264; => 98
+&#x98; => &#x98;
+
+GREATER-THAN OR EQUAL TO: &#x2265; => 99
+&#x99; => &#x99;
+
+NO-BREAK SPACE: &#xA0; => 9a
+&#x9A; => &#x9A;
+
+BOTTOM HALF INTEGRAL: &#x2321; => 9b
+&#x9B; => &#x9B;
+
+DEGREE SIGN: &#xB0; => 9c
+&#x9C; => &#x9C;
+
+SUPERSCRIPT TWO: &#xB2; => 9d
+&#x9D; => &#x9D;
+
+MIDDLE DOT: &#xB7; => 9e
+&#x9E; => &#x9E;
+
+DIVISION SIGN: &#xF7; => 9f
+&#x9F; => &#x9F;
+
+BOX DRAWINGS DOUBLE HORIZONTAL: &#x2550; => a0
+&#xA0; => 9a
+
+BOX DRAWINGS DOUBLE VERTICAL: &#x2551; => a1
+&#xA1; => &#xA1;
+
+BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE: &#x2552; => a2
+&#xA2; => &#xA2;
+
+CYRILLIC SMALL LETTER IO: &#x451; => a3
+&#xA3; => &#xA3;
+
+BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE: &#x2553; => a4
+&#xA4; => &#xA4;
+
+BOX DRAWINGS DOUBLE DOWN AND RIGHT: &#x2554; => a5
+&#xA5; => &#xA5;
+
+BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE: &#x2555; => a6
+&#xA6; => &#xA6;
+
+BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE: &#x2556; => a7
+&#xA7; => &#xA7;
+
+BOX DRAWINGS DOUBLE DOWN AND LEFT: &#x2557; => a8
+&#xA8; => &#xA8;
+
+BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE: &#x2558; => a9
+&#xA9; => bf
+
+BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE: &#x2559; => aa
+&#xAA; => &#xAA;
+
+BOX DRAWINGS DOUBLE UP AND RIGHT: &#x255A; => ab
+&#xAB; => &#xAB;
+
+BOX DRAWINGS UP SINGLE AND LEFT DOUBLE: &#x255B; => ac
+&#xAC; => &#xAC;
+
+BOX DRAWINGS UP DOUBLE AND LEFT SINGLE: &#x255C; => ad
+&#xAD; => &#xAD;
+
+BOX DRAWINGS DOUBLE UP AND LEFT: &#x255D; => ae
+&#xAE; => &#xAE;
+
+BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE: &#x255E; => af
+&#xAF; => &#xAF;
+
+BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE: &#x255F; => b0
+&#xB0; => 9c
+
+BOX DRAWINGS DOUBLE VERTICAL AND RIGHT: &#x2560; => b1
+&#xB1; => &#xB1;
+
+BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE: &#x2561; => b2
+&#xB2; => 9d
+
+CYRILLIC CAPITAL LETTER IO: &#x401; => b3
+&#xB3; => &#xB3;
+
+BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE: &#x2562; => b4
+&#xB4; => &#xB4;
+
+BOX DRAWINGS DOUBLE VERTICAL AND LEFT: &#x2563; => b5
+&#xB5; => &#xB5;
+
+BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE: &#x2564; => b6
+&#xB6; => &#xB6;
+
+BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE: &#x2565; => b7
+&#xB7; => 9e
+
+BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL: &#x2566; => b8
+&#xB8; => &#xB8;
+
+BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE: &#x2567; => b9
+&#xB9; => &#xB9;
+
+BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE: &#x2568; => ba
+&#xBA; => &#xBA;
+
+BOX DRAWINGS DOUBLE UP AND HORIZONTAL: &#x2569; => bb
+&#xBB; => &#xBB;
+
+BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE: &#x256A; => bc
+&#xBC; => &#xBC;
+
+BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE: &#x256B; => bd
+&#xBD; => &#xBD;
+
+BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL: &#x256C; => be
+&#xBE; => &#xBE;
+
+COPYRIGHT SIGN: &#xA9; => bf
+&#xBF; => &#xBF;
+
+CYRILLIC SMALL LETTER YU: &#x44E; => c0
+&#xC0; => &#xC0;
+
+CYRILLIC SMALL LETTER A: &#x430; => c1
+&#xC1; => &#xC1;
+
+CYRILLIC SMALL LETTER BE: &#x431; => c2
+&#xC2; => &#xC2;
+
+CYRILLIC SMALL LETTER TSE: &#x446; => c3
+&#xC3; => &#xC3;
+
+CYRILLIC SMALL LETTER DE: &#x434; => c4
+&#xC4; => &#xC4;
+
+CYRILLIC SMALL LETTER IE: &#x435; => c5
+&#xC5; => &#xC5;
+
+CYRILLIC SMALL LETTER EF: &#x444; => c6
+&#xC6; => &#xC6;
+
+CYRILLIC SMALL LETTER GHE: &#x433; => c7
+&#xC7; => &#xC7;
+
+CYRILLIC SMALL LETTER HA: &#x445; => c8
+&#xC8; => &#xC8;
+
+CYRILLIC SMALL LETTER I: &#x438; => c9
+&#xC9; => &#xC9;
+
+CYRILLIC SMALL LETTER SHORT I: &#x439; => ca
+&#xCA; => &#xCA;
+
+CYRILLIC SMALL LETTER KA: &#x43A; => cb
+&#xCB; => &#xCB;
+
+CYRILLIC SMALL LETTER EL: &#x43B; => cc
+&#xCC; => &#xCC;
+
+CYRILLIC SMALL LETTER EM: &#x43C; => cd
+&#xCD; => &#xCD;
+
+CYRILLIC SMALL LETTER EN: &#x43D; => ce
+&#xCE; => &#xCE;
+
+CYRILLIC SMALL LETTER O: &#x43E; => cf
+&#xCF; => &#xCF;
+
+CYRILLIC SMALL LETTER PE: &#x43F; => d0
+&#xD0; => &#xD0;
+
+CYRILLIC SMALL LETTER YA: &#x44F; => d1
+&#xD1; => &#xD1;
+
+CYRILLIC SMALL LETTER ER: &#x440; => d2
+&#xD2; => &#xD2;
+
+CYRILLIC SMALL LETTER ES: &#x441; => d3
+&#xD3; => &#xD3;
+
+CYRILLIC SMALL LETTER TE: &#x442; => d4
+&#xD4; => &#xD4;
+
+CYRILLIC SMALL LETTER U: &#x443; => d5
+&#xD5; => &#xD5;
+
+CYRILLIC SMALL LETTER ZHE: &#x436; => d6
+&#xD6; => &#xD6;
+
+CYRILLIC SMALL LETTER VE: &#x432; => d7
+&#xD7; => &#xD7;
+
+CYRILLIC SMALL LETTER SOFT SIGN: &#x44C; => d8
+&#xD8; => &#xD8;
+
+CYRILLIC SMALL LETTER YERU: &#x44B; => d9
+&#xD9; => &#xD9;
+
+CYRILLIC SMALL LETTER ZE: &#x437; => da
+&#xDA; => &#xDA;
+
+CYRILLIC SMALL LETTER SHA: &#x448; => db
+&#xDB; => &#xDB;
+
+CYRILLIC SMALL LETTER E: &#x44D; => dc
+&#xDC; => &#xDC;
+
+CYRILLIC SMALL LETTER SHCHA: &#x449; => dd
+&#xDD; => &#xDD;
+
+CYRILLIC SMALL LETTER CHE: &#x447; => de
+&#xDE; => &#xDE;
+
+CYRILLIC SMALL LETTER HARD SIGN: &#x44A; => df
+&#xDF; => &#xDF;
+
+CYRILLIC CAPITAL LETTER YU: &#x42E; => e0
+&#xE0; => &#xE0;
+
+CYRILLIC CAPITAL LETTER A: &#x410; => e1
+&#xE1; => &#xE1;
+
+CYRILLIC CAPITAL LETTER BE: &#x411; => e2
+&#xE2; => &#xE2;
+
+CYRILLIC CAPITAL LETTER TSE: &#x426; => e3
+&#xE3; => &#xE3;
+
+CYRILLIC CAPITAL LETTER DE: &#x414; => e4
+&#xE4; => &#xE4;
+
+CYRILLIC CAPITAL LETTER IE: &#x415; => e5
+&#xE5; => &#xE5;
+
+CYRILLIC CAPITAL LETTER EF: &#x424; => e6
+&#xE6; => &#xE6;
+
+CYRILLIC CAPITAL LETTER GHE: &#x413; => e7
+&#xE7; => &#xE7;
+
+CYRILLIC CAPITAL LETTER HA: &#x425; => e8
+&#xE8; => &#xE8;
+
+CYRILLIC CAPITAL LETTER I: &#x418; => e9
+&#xE9; => &#xE9;
+
+CYRILLIC CAPITAL LETTER SHORT I: &#x419; => ea
+&#xEA; => &#xEA;
+
+CYRILLIC CAPITAL LETTER KA: &#x41A; => eb
+&#xEB; => &#xEB;
+
+CYRILLIC CAPITAL LETTER EL: &#x41B; => ec
+&#xEC; => &#xEC;
+
+CYRILLIC CAPITAL LETTER EM: &#x41C; => ed
+&#xED; => &#xED;
+
+CYRILLIC CAPITAL LETTER EN: &#x41D; => ee
+&#xEE; => &#xEE;
+
+CYRILLIC CAPITAL LETTER O: &#x41E; => ef
+&#xEF; => &#xEF;
+
+CYRILLIC CAPITAL LETTER PE: &#x41F; => f0
+&#xF0; => &#xF0;
+
+CYRILLIC CAPITAL LETTER YA: &#x42F; => f1
+&#xF1; => &#xF1;
+
+CYRILLIC CAPITAL LETTER ER: &#x420; => f2
+&#xF2; => &#xF2;
+
+CYRILLIC CAPITAL LETTER ES: &#x421; => f3
+&#xF3; => &#xF3;
+
+CYRILLIC CAPITAL LETTER TE: &#x422; => f4
+&#xF4; => &#xF4;
+
+CYRILLIC CAPITAL LETTER U: &#x423; => f5
+&#xF5; => &#xF5;
+
+CYRILLIC CAPITAL LETTER ZHE: &#x416; => f6
+&#xF6; => &#xF6;
+
+CYRILLIC CAPITAL LETTER VE: &#x412; => f7
+&#xF7; => 9f
+
+CYRILLIC CAPITAL LETTER SOFT SIGN: &#x42C; => f8
+&#xF8; => &#xF8;
+
+CYRILLIC CAPITAL LETTER YERU: &#x42B; => f9
+&#xF9; => &#xF9;
+
+CYRILLIC CAPITAL LETTER ZE: &#x417; => fa
+&#xFA; => &#xFA;
+
+CYRILLIC CAPITAL LETTER SHA: &#x428; => fb
+&#xFB; => &#xFB;
+
+CYRILLIC CAPITAL LETTER E: &#x42D; => fc
+&#xFC; => &#xFC;
+
+CYRILLIC CAPITAL LETTER SHCHA: &#x429; => fd
+&#xFD; => &#xFD;
+
+CYRILLIC CAPITAL LETTER CHE: &#x427; => fe
+&#xFE; => &#xFE;
+
+CYRILLIC CAPITAL LETTER HARD SIGN: &#x42A; => ff
+&#xFF; => &#xFF;
+
+
diff --git a/ext/standard/tests/strings/html_entity_decode_macroman.phpt b/ext/standard/tests/strings/html_entity_decode_macroman.phpt
new file mode 100644
index 0000000000..4691bcf1a7
--- /dev/null
+++ b/ext/standard/tests/strings/html_entity_decode_macroman.phpt
@@ -0,0 +1,540 @@
+--TEST--
+Translation of HTML entities for encoding MacRoman
+--FILE--
+<?php
+$arr = array(
+0x00C4 => array(0x80, "LATIN CAPITAL LETTER A WITH DIAERESIS"),
+0x00C5 => array(0x81, "LATIN CAPITAL LETTER A WITH RING ABOVE"),
+0x00C7 => array(0x82, "LATIN CAPITAL LETTER C WITH CEDILLA"),
+0x00C9 => array(0x83, "LATIN CAPITAL LETTER E WITH ACUTE"),
+0x00D1 => array(0x84, "LATIN CAPITAL LETTER N WITH TILDE"),
+0x00D6 => array(0x85, "LATIN CAPITAL LETTER O WITH DIAERESIS"),
+0x00DC => array(0x86, "LATIN CAPITAL LETTER U WITH DIAERESIS"),
+0x00E1 => array(0x87, "LATIN SMALL LETTER A WITH ACUTE"),
+0x00E0 => array(0x88, "LATIN SMALL LETTER A WITH GRAVE"),
+0x00E2 => array(0x89, "LATIN SMALL LETTER A WITH CIRCUMFLEX"),
+0x00E4 => array(0x8A, "LATIN SMALL LETTER A WITH DIAERESIS"),
+0x00E3 => array(0x8B, "LATIN SMALL LETTER A WITH TILDE"),
+0x00E5 => array(0x8C, "LATIN SMALL LETTER A WITH RING ABOVE"),
+0x00E7 => array(0x8D, "LATIN SMALL LETTER C WITH CEDILLA"),
+0x00E9 => array(0x8E, "LATIN SMALL LETTER E WITH ACUTE"),
+0x00E8 => array(0x8F, "LATIN SMALL LETTER E WITH GRAVE"),
+0x00EA => array(0x90, "LATIN SMALL LETTER E WITH CIRCUMFLEX"),
+0x00EB => array(0x91, "LATIN SMALL LETTER E WITH DIAERESIS"),
+0x00ED => array(0x92, "LATIN SMALL LETTER I WITH ACUTE"),
+0x00EC => array(0x93, "LATIN SMALL LETTER I WITH GRAVE"),
+0x00EE => array(0x94, "LATIN SMALL LETTER I WITH CIRCUMFLEX"),
+0x00EF => array(0x95, "LATIN SMALL LETTER I WITH DIAERESIS"),
+0x00F1 => array(0x96, "LATIN SMALL LETTER N WITH TILDE"),
+0x00F3 => array(0x97, "LATIN SMALL LETTER O WITH ACUTE"),
+0x00F2 => array(0x98, "LATIN SMALL LETTER O WITH GRAVE"),
+0x00F4 => array(0x99, "LATIN SMALL LETTER O WITH CIRCUMFLEX"),
+0x00F6 => array(0x9A, "LATIN SMALL LETTER O WITH DIAERESIS"),
+0x00F5 => array(0x9B, "LATIN SMALL LETTER O WITH TILDE"),
+0x00FA => array(0x9C, "LATIN SMALL LETTER U WITH ACUTE"),
+0x00F9 => array(0x9D, "LATIN SMALL LETTER U WITH GRAVE"),
+0x00FB => array(0x9E, "LATIN SMALL LETTER U WITH CIRCUMFLEX"),
+0x00FC => array(0x9F, "LATIN SMALL LETTER U WITH DIAERESIS"),
+0x2020 => array(0xA0, "DAGGER"),
+0x00B0 => array(0xA1, "DEGREE SIGN"),
+0x00A2 => array(0xA2, "CENT SIGN"),
+0x00A3 => array(0xA3, "POUND SIGN"),
+0x00A7 => array(0xA4, "SECTION SIGN"),
+0x2022 => array(0xA5, "BULLET"),
+0x00B6 => array(0xA6, "PILCROW SIGN"),
+0x00DF => array(0xA7, "LATIN SMALL LETTER SHARP S"),
+0x00AE => array(0xA8, "REGISTERED SIGN"),
+0x00A9 => array(0xA9, "COPYRIGHT SIGN"),
+0x2122 => array(0xAA, "TRADE MARK SIGN"),
+0x00B4 => array(0xAB, "ACUTE ACCENT"),
+0x00A8 => array(0xAC, "DIAERESIS"),
+0x2260 => array(0xAD, "NOT EQUAL TO"),
+0x00C6 => array(0xAE, "LATIN CAPITAL LETTER AE"),
+0x00D8 => array(0xAF, "LATIN CAPITAL LETTER O WITH STROKE"),
+0x221E => array(0xB0, "INFINITY"),
+0x00B1 => array(0xB1, "PLUS-MINUS SIGN"),
+0x2264 => array(0xB2, "LESS-THAN OR EQUAL TO"),
+0x2265 => array(0xB3, "GREATER-THAN OR EQUAL TO"),
+0x00A5 => array(0xB4, "YEN SIGN"),
+0x00B5 => array(0xB5, "MICRO SIGN"),
+0x2202 => array(0xB6, "PARTIAL DIFFERENTIAL"),
+0x2211 => array(0xB7, "N-ARY SUMMATION"),
+0x220F => array(0xB8, "N-ARY PRODUCT"),
+0x03C0 => array(0xB9, "GREEK SMALL LETTER PI"),
+0x222B => array(0xBA, "INTEGRAL"),
+0x00AA => array(0xBB, "FEMININE ORDINAL INDICATOR"),
+0x00BA => array(0xBC, "MASCULINE ORDINAL INDICATOR"),
+0x03A9 => array(0xBD, "GREEK CAPITAL LETTER OMEGA"),
+0x00E6 => array(0xBE, "LATIN SMALL LETTER AE"),
+0x00F8 => array(0xBF, "LATIN SMALL LETTER O WITH STROKE"),
+0x00BF => array(0xC0, "INVERTED QUESTION MARK"),
+0x00A1 => array(0xC1, "INVERTED EXCLAMATION MARK"),
+0x00AC => array(0xC2, "NOT SIGN"),
+0x221A => array(0xC3, "SQUARE ROOT"),
+0x0192 => array(0xC4, "LATIN SMALL LETTER F WITH HOOK"),
+0x2248 => array(0xC5, "ALMOST EQUAL TO"),
+0x2206 => array(0xC6, "INCREMENT"),
+0x00AB => array(0xC7, "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK"),
+0x00BB => array(0xC8, "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK"),
+0x2026 => array(0xC9, "HORIZONTAL ELLIPSIS"),
+0x00A0 => array(0xCA, "NO-BREAK SPACE"),
+0x00C0 => array(0xCB, "LATIN CAPITAL LETTER A WITH GRAVE"),
+0x00C3 => array(0xCC, "LATIN CAPITAL LETTER A WITH TILDE"),
+0x00D5 => array(0xCD, "LATIN CAPITAL LETTER O WITH TILDE"),
+0x0152 => array(0xCE, "LATIN CAPITAL LIGATURE OE"),
+0x0153 => array(0xCF, "LATIN SMALL LIGATURE OE"),
+0x2013 => array(0xD0, "EN DASH"),
+0x2014 => array(0xD1, "EM DASH"),
+0x201C => array(0xD2, "LEFT DOUBLE QUOTATION MARK"),
+0x201D => array(0xD3, "RIGHT DOUBLE QUOTATION MARK"),
+0x2018 => array(0xD4, "LEFT SINGLE QUOTATION MARK"),
+0x2019 => array(0xD5, "RIGHT SINGLE QUOTATION MARK"),
+0x00F7 => array(0xD6, "DIVISION SIGN"),
+0x25CA => array(0xD7, "LOZENGE"),
+0x00FF => array(0xD8, "LATIN SMALL LETTER Y WITH DIAERESIS"),
+0x0178 => array(0xD9, "LATIN CAPITAL LETTER Y WITH DIAERESIS"),
+0x2044 => array(0xDA, "FRACTION SLASH"),
+0x20AC => array(0xDB, "EURO SIGN"),
+0x2039 => array(0xDC, "SINGLE LEFT-POINTING ANGLE QUOTATION MARK"),
+0x203A => array(0xDD, "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK"),
+0xFB01 => array(0xDE, "LATIN SMALL LIGATURE FI"),
+0xFB02 => array(0xDF, "LATIN SMALL LIGATURE FL"),
+0x2021 => array(0xE0, "DOUBLE DAGGER"),
+0x00B7 => array(0xE1, "MIDDLE DOT"),
+0x201A => array(0xE2, "SINGLE LOW-9 QUOTATION MARK"),
+0x201E => array(0xE3, "DOUBLE LOW-9 QUOTATION MARK"),
+0x2030 => array(0xE4, "PER MILLE SIGN"),
+0x00C2 => array(0xE5, "LATIN CAPITAL LETTER A WITH CIRCUMFLEX"),
+0x00CA => array(0xE6, "LATIN CAPITAL LETTER E WITH CIRCUMFLEX"),
+0x00C1 => array(0xE7, "LATIN CAPITAL LETTER A WITH ACUTE"),
+0x00CB => array(0xE8, "LATIN CAPITAL LETTER E WITH DIAERESIS"),
+0x00C8 => array(0xE9, "LATIN CAPITAL LETTER E WITH GRAVE"),
+0x00CD => array(0xEA, "LATIN CAPITAL LETTER I WITH ACUTE"),
+0x00CE => array(0xEB, "LATIN CAPITAL LETTER I WITH CIRCUMFLEX"),
+0x00CF => array(0xEC, "LATIN CAPITAL LETTER I WITH DIAERESIS"),
+0x00CC => array(0xED, "LATIN CAPITAL LETTER I WITH GRAVE"),
+0x00D3 => array(0xEE, "LATIN CAPITAL LETTER O WITH ACUTE"),
+0x00D4 => array(0xEF, "LATIN CAPITAL LETTER O WITH CIRCUMFLEX"),
+0xF8FF => array(0xF0, "Apple logo"),
+0x00D2 => array(0xF1, "LATIN CAPITAL LETTER O WITH GRAVE"),
+0x00DA => array(0xF2, "LATIN CAPITAL LETTER U WITH ACUTE"),
+0x00DB => array(0xF3, "LATIN CAPITAL LETTER U WITH CIRCUMFLEX"),
+0x00D9 => array(0xF4, "LATIN CAPITAL LETTER U WITH GRAVE"),
+0x0131 => array(0xF5, "LATIN SMALL LETTER DOTLESS I"),
+0x02C6 => array(0xF6, "MODIFIER LETTER CIRCUMFLEX ACCENT"),
+0x02DC => array(0xF7, "SMALL TILDE"),
+0x00AF => array(0xF8, "MACRON"),
+0x02D8 => array(0xF9, "BREVE"),
+0x02D9 => array(0xFA, "DOT ABOVE"),
+0x02DA => array(0xFB, "RING ABOVE"),
+0x00B8 => array(0xFC, "CEDILLA"),
+0x02DD => array(0xFD, "DOUBLE ACUTE ACCENT"),
+0x02DB => array(0xFE, "OGONEK"),
+0x02C7 => array(0xFF, "CARON"),
+);
+
+$res = html_entity_decode("&#x7F;", ENT_QUOTES, 'MacRoman');
+echo "Special test for &#x7F; (shouldn't decode):\n";
+echo $res,"\n\n";
+
+foreach ($arr as $u => $v) {
+ $ent = sprintf("&#x%X;", $u);
+ $res = html_entity_decode($ent, ENT_QUOTES, 'MacRoman');
+ $d = unpack("H*", $res);
+ echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]);
+
+ $ent = sprintf("&#x%X;", $v[0]);
+ $res = html_entity_decode($ent, ENT_QUOTES, 'MacRoman');
+ if ($res[0] != "&" || $res[1] != "#")
+ $res = unpack("H*", $res)[1];
+ echo sprintf("%s => %s\n\n", $ent, $res);
+}
+--EXPECT--
+Special test for &#x7F; (shouldn't decode):
+&#x7F;
+
+LATIN CAPITAL LETTER A WITH DIAERESIS: &#xC4; => 80
+&#x80; => &#x80;
+
+LATIN CAPITAL LETTER A WITH RING ABOVE: &#xC5; => 81
+&#x81; => &#x81;
+
+LATIN CAPITAL LETTER C WITH CEDILLA: &#xC7; => 82
+&#x82; => &#x82;
+
+LATIN CAPITAL LETTER E WITH ACUTE: &#xC9; => 83
+&#x83; => &#x83;
+
+LATIN CAPITAL LETTER N WITH TILDE: &#xD1; => 84
+&#x84; => &#x84;
+
+LATIN CAPITAL LETTER O WITH DIAERESIS: &#xD6; => 85
+&#x85; => &#x85;
+
+LATIN CAPITAL LETTER U WITH DIAERESIS: &#xDC; => 86
+&#x86; => &#x86;
+
+LATIN SMALL LETTER A WITH ACUTE: &#xE1; => 87
+&#x87; => &#x87;
+
+LATIN SMALL LETTER A WITH GRAVE: &#xE0; => 88
+&#x88; => &#x88;
+
+LATIN SMALL LETTER A WITH CIRCUMFLEX: &#xE2; => 89
+&#x89; => &#x89;
+
+LATIN SMALL LETTER A WITH DIAERESIS: &#xE4; => 8a
+&#x8A; => &#x8A;
+
+LATIN SMALL LETTER A WITH TILDE: &#xE3; => 8b
+&#x8B; => &#x8B;
+
+LATIN SMALL LETTER A WITH RING ABOVE: &#xE5; => 8c
+&#x8C; => &#x8C;
+
+LATIN SMALL LETTER C WITH CEDILLA: &#xE7; => 8d
+&#x8D; => &#x8D;
+
+LATIN SMALL LETTER E WITH ACUTE: &#xE9; => 8e
+&#x8E; => &#x8E;
+
+LATIN SMALL LETTER E WITH GRAVE: &#xE8; => 8f
+&#x8F; => &#x8F;
+
+LATIN SMALL LETTER E WITH CIRCUMFLEX: &#xEA; => 90
+&#x90; => &#x90;
+
+LATIN SMALL LETTER E WITH DIAERESIS: &#xEB; => 91
+&#x91; => &#x91;
+
+LATIN SMALL LETTER I WITH ACUTE: &#xED; => 92
+&#x92; => &#x92;
+
+LATIN SMALL LETTER I WITH GRAVE: &#xEC; => 93
+&#x93; => &#x93;
+
+LATIN SMALL LETTER I WITH CIRCUMFLEX: &#xEE; => 94
+&#x94; => &#x94;
+
+LATIN SMALL LETTER I WITH DIAERESIS: &#xEF; => 95
+&#x95; => &#x95;
+
+LATIN SMALL LETTER N WITH TILDE: &#xF1; => 96
+&#x96; => &#x96;
+
+LATIN SMALL LETTER O WITH ACUTE: &#xF3; => 97
+&#x97; => &#x97;
+
+LATIN SMALL LETTER O WITH GRAVE: &#xF2; => 98
+&#x98; => &#x98;
+
+LATIN SMALL LETTER O WITH CIRCUMFLEX: &#xF4; => 99
+&#x99; => &#x99;
+
+LATIN SMALL LETTER O WITH DIAERESIS: &#xF6; => 9a
+&#x9A; => &#x9A;
+
+LATIN SMALL LETTER O WITH TILDE: &#xF5; => 9b
+&#x9B; => &#x9B;
+
+LATIN SMALL LETTER U WITH ACUTE: &#xFA; => 9c
+&#x9C; => &#x9C;
+
+LATIN SMALL LETTER U WITH GRAVE: &#xF9; => 9d
+&#x9D; => &#x9D;
+
+LATIN SMALL LETTER U WITH CIRCUMFLEX: &#xFB; => 9e
+&#x9E; => &#x9E;
+
+LATIN SMALL LETTER U WITH DIAERESIS: &#xFC; => 9f
+&#x9F; => &#x9F;
+
+DAGGER: &#x2020; => a0
+&#xA0; => ca
+
+DEGREE SIGN: &#xB0; => a1
+&#xA1; => c1
+
+CENT SIGN: &#xA2; => a2
+&#xA2; => a2
+
+POUND SIGN: &#xA3; => a3
+&#xA3; => a3
+
+SECTION SIGN: &#xA7; => a4
+&#xA4; => &#xA4;
+
+BULLET: &#x2022; => a5
+&#xA5; => b4
+
+PILCROW SIGN: &#xB6; => a6
+&#xA6; => &#xA6;
+
+LATIN SMALL LETTER SHARP S: &#xDF; => a7
+&#xA7; => a4
+
+REGISTERED SIGN: &#xAE; => a8
+&#xA8; => ac
+
+COPYRIGHT SIGN: &#xA9; => a9
+&#xA9; => a9
+
+TRADE MARK SIGN: &#x2122; => aa
+&#xAA; => bb
+
+ACUTE ACCENT: &#xB4; => ab
+&#xAB; => c7
+
+DIAERESIS: &#xA8; => ac
+&#xAC; => c2
+
+NOT EQUAL TO: &#x2260; => ad
+&#xAD; => &#xAD;
+
+LATIN CAPITAL LETTER AE: &#xC6; => ae
+&#xAE; => a8
+
+LATIN CAPITAL LETTER O WITH STROKE: &#xD8; => af
+&#xAF; => f8
+
+INFINITY: &#x221E; => b0
+&#xB0; => a1
+
+PLUS-MINUS SIGN: &#xB1; => b1
+&#xB1; => b1
+
+LESS-THAN OR EQUAL TO: &#x2264; => b2
+&#xB2; => &#xB2;
+
+GREATER-THAN OR EQUAL TO: &#x2265; => b3
+&#xB3; => &#xB3;
+
+YEN SIGN: &#xA5; => b4
+&#xB4; => ab
+
+MICRO SIGN: &#xB5; => b5
+&#xB5; => b5
+
+PARTIAL DIFFERENTIAL: &#x2202; => b6
+&#xB6; => a6
+
+N-ARY SUMMATION: &#x2211; => b7
+&#xB7; => e1
+
+N-ARY PRODUCT: &#x220F; => b8
+&#xB8; => fc
+
+GREEK SMALL LETTER PI: &#x3C0; => b9
+&#xB9; => &#xB9;
+
+INTEGRAL: &#x222B; => ba
+&#xBA; => bc
+
+FEMININE ORDINAL INDICATOR: &#xAA; => bb
+&#xBB; => c8
+
+MASCULINE ORDINAL INDICATOR: &#xBA; => bc
+&#xBC; => &#xBC;
+
+GREEK CAPITAL LETTER OMEGA: &#x3A9; => bd
+&#xBD; => &#xBD;
+
+LATIN SMALL LETTER AE: &#xE6; => be
+&#xBE; => &#xBE;
+
+LATIN SMALL LETTER O WITH STROKE: &#xF8; => bf
+&#xBF; => c0
+
+INVERTED QUESTION MARK: &#xBF; => c0
+&#xC0; => cb
+
+INVERTED EXCLAMATION MARK: &#xA1; => c1
+&#xC1; => e7
+
+NOT SIGN: &#xAC; => c2
+&#xC2; => e5
+
+SQUARE ROOT: &#x221A; => c3
+&#xC3; => cc
+
+LATIN SMALL LETTER F WITH HOOK: &#x192; => c4
+&#xC4; => 80
+
+ALMOST EQUAL TO: &#x2248; => c5
+&#xC5; => 81
+
+INCREMENT: &#x2206; => c6
+&#xC6; => ae
+
+LEFT-POINTING DOUBLE ANGLE QUOTATION MARK: &#xAB; => c7
+&#xC7; => 82
+
+RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK: &#xBB; => c8
+&#xC8; => e9
+
+HORIZONTAL ELLIPSIS: &#x2026; => c9
+&#xC9; => 83
+
+NO-BREAK SPACE: &#xA0; => ca
+&#xCA; => e6
+
+LATIN CAPITAL LETTER A WITH GRAVE: &#xC0; => cb
+&#xCB; => e8
+
+LATIN CAPITAL LETTER A WITH TILDE: &#xC3; => cc
+&#xCC; => ed
+
+LATIN CAPITAL LETTER O WITH TILDE: &#xD5; => cd
+&#xCD; => ea
+
+LATIN CAPITAL LIGATURE OE: &#x152; => ce
+&#xCE; => eb
+
+LATIN SMALL LIGATURE OE: &#x153; => cf
+&#xCF; => ec
+
+EN DASH: &#x2013; => d0
+&#xD0; => &#xD0;
+
+EM DASH: &#x2014; => d1
+&#xD1; => 84
+
+LEFT DOUBLE QUOTATION MARK: &#x201C; => d2
+&#xD2; => f1
+
+RIGHT DOUBLE QUOTATION MARK: &#x201D; => d3
+&#xD3; => ee
+
+LEFT SINGLE QUOTATION MARK: &#x2018; => d4
+&#xD4; => ef
+
+RIGHT SINGLE QUOTATION MARK: &#x2019; => d5
+&#xD5; => cd
+
+DIVISION SIGN: &#xF7; => d6
+&#xD6; => 85
+
+LOZENGE: &#x25CA; => d7
+&#xD7; => &#xD7;
+
+LATIN SMALL LETTER Y WITH DIAERESIS: &#xFF; => d8
+&#xD8; => af
+
+LATIN CAPITAL LETTER Y WITH DIAERESIS: &#x178; => d9
+&#xD9; => f4
+
+FRACTION SLASH: &#x2044; => da
+&#xDA; => f2
+
+EURO SIGN: &#x20AC; => db
+&#xDB; => f3
+
+SINGLE LEFT-POINTING ANGLE QUOTATION MARK: &#x2039; => dc
+&#xDC; => 86
+
+SINGLE RIGHT-POINTING ANGLE QUOTATION MARK: &#x203A; => dd
+&#xDD; => &#xDD;
+
+LATIN SMALL LIGATURE FI: &#xFB01; => de
+&#xDE; => &#xDE;
+
+LATIN SMALL LIGATURE FL: &#xFB02; => df
+&#xDF; => a7
+
+DOUBLE DAGGER: &#x2021; => e0
+&#xE0; => 88
+
+MIDDLE DOT: &#xB7; => e1
+&#xE1; => 87
+
+SINGLE LOW-9 QUOTATION MARK: &#x201A; => e2
+&#xE2; => 89
+
+DOUBLE LOW-9 QUOTATION MARK: &#x201E; => e3
+&#xE3; => 8b
+
+PER MILLE SIGN: &#x2030; => e4
+&#xE4; => 8a
+
+LATIN CAPITAL LETTER A WITH CIRCUMFLEX: &#xC2; => e5
+&#xE5; => 8c
+
+LATIN CAPITAL LETTER E WITH CIRCUMFLEX: &#xCA; => e6
+&#xE6; => be
+
+LATIN CAPITAL LETTER A WITH ACUTE: &#xC1; => e7
+&#xE7; => 8d
+
+LATIN CAPITAL LETTER E WITH DIAERESIS: &#xCB; => e8
+&#xE8; => 8f
+
+LATIN CAPITAL LETTER E WITH GRAVE: &#xC8; => e9
+&#xE9; => 8e
+
+LATIN CAPITAL LETTER I WITH ACUTE: &#xCD; => ea
+&#xEA; => 90
+
+LATIN CAPITAL LETTER I WITH CIRCUMFLEX: &#xCE; => eb
+&#xEB; => 91
+
+LATIN CAPITAL LETTER I WITH DIAERESIS: &#xCF; => ec
+&#xEC; => 93
+
+LATIN CAPITAL LETTER I WITH GRAVE: &#xCC; => ed
+&#xED; => 92
+
+LATIN CAPITAL LETTER O WITH ACUTE: &#xD3; => ee
+&#xEE; => 94
+
+LATIN CAPITAL LETTER O WITH CIRCUMFLEX: &#xD4; => ef
+&#xEF; => 95
+
+Apple logo: &#xF8FF; => f0
+&#xF0; => &#xF0;
+
+LATIN CAPITAL LETTER O WITH GRAVE: &#xD2; => f1
+&#xF1; => 96
+
+LATIN CAPITAL LETTER U WITH ACUTE: &#xDA; => f2
+&#xF2; => 98
+
+LATIN CAPITAL LETTER U WITH CIRCUMFLEX: &#xDB; => f3
+&#xF3; => 97
+
+LATIN CAPITAL LETTER U WITH GRAVE: &#xD9; => f4
+&#xF4; => 99
+
+LATIN SMALL LETTER DOTLESS I: &#x131; => f5
+&#xF5; => 9b
+
+MODIFIER LETTER CIRCUMFLEX ACCENT: &#x2C6; => f6
+&#xF6; => 9a
+
+SMALL TILDE: &#x2DC; => f7
+&#xF7; => d6
+
+MACRON: &#xAF; => f8
+&#xF8; => bf
+
+BREVE: &#x2D8; => f9
+&#xF9; => 9d
+
+DOT ABOVE: &#x2D9; => fa
+&#xFA; => 9c
+
+RING ABOVE: &#x2DA; => fb
+&#xFB; => 9e
+
+CEDILLA: &#xB8; => fc
+&#xFC; => 9f
+
+DOUBLE ACUTE ACCENT: &#x2DD; => fd
+&#xFD; => &#xFD;
+
+OGONEK: &#x2DB; => fe
+&#xFE; => &#xFE;
+
+CARON: &#x2C7; => ff
+&#xFF; => d8
+
+
diff --git a/ext/standard/tests/strings/html_entity_decode_win1251.phpt b/ext/standard/tests/strings/html_entity_decode_win1251.phpt
new file mode 100644
index 0000000000..e47392623c
--- /dev/null
+++ b/ext/standard/tests/strings/html_entity_decode_win1251.phpt
@@ -0,0 +1,537 @@
+--TEST--
+Translation of HTML entities for encoding WIN-1251
+--FILE--
+<?php
+$arr = array( // Unicode code point => array(byte value in WINDOWS-1251, Unicode character name)
+0x0402 => array(0x80, "CYRILLIC CAPITAL LETTER DJE"),
+0x0403 => array(0x81, "CYRILLIC CAPITAL LETTER GJE"),
+0x201A => array(0x82, "SINGLE LOW-9 QUOTATION MARK"),
+0x0453 => array(0x83, "CYRILLIC SMALL LETTER GJE"),
+0x201E => array(0x84, "DOUBLE LOW-9 QUOTATION MARK"),
+0x2026 => array(0x85, "HORIZONTAL ELLIPSIS"),
+0x2020 => array(0x86, "DAGGER"),
+0x2021 => array(0x87, "DOUBLE DAGGER"),
+0x20AC => array(0x88, "EURO SIGN"),
+0x2030 => array(0x89, "PER MILLE SIGN"),
+0x0409 => array(0x8A, "CYRILLIC CAPITAL LETTER LJE"),
+0x2039 => array(0x8B, "SINGLE LEFT-POINTING ANGLE QUOTATION MARK"),
+0x040A => array(0x8C, "CYRILLIC CAPITAL LETTER NJE"),
+0x040C => array(0x8D, "CYRILLIC CAPITAL LETTER KJE"),
+0x040B => array(0x8E, "CYRILLIC CAPITAL LETTER TSHE"),
+0x040F => array(0x8F, "CYRILLIC CAPITAL LETTER DZHE"),
+0x0452 => array(0x90, "CYRILLIC SMALL LETTER DJE"),
+0x2018 => array(0x91, "LEFT SINGLE QUOTATION MARK"),
+0x2019 => array(0x92, "RIGHT SINGLE QUOTATION MARK"),
+0x201C => array(0x93, "LEFT DOUBLE QUOTATION MARK"),
+0x201D => array(0x94, "RIGHT DOUBLE QUOTATION MARK"),
+0x2022 => array(0x95, "BULLET"),
+0x2013 => array(0x96, "EN DASH"),
+0x2014 => array(0x97, "EM DASH"),
+//0x98 #UNDEFINED
+0x2122 => array(0x99, "TRADE MARK SIGN"),
+0x0459 => array(0x9A, "CYRILLIC SMALL LETTER LJE"),
+0x203A => array(0x9B, "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK"),
+0x045A => array(0x9C, "CYRILLIC SMALL LETTER NJE"),
+0x045C => array(0x9D, "CYRILLIC SMALL LETTER KJE"),
+0x045B => array(0x9E, "CYRILLIC SMALL LETTER TSHE"),
+0x045F => array(0x9F, "CYRILLIC SMALL LETTER DZHE"),
+0x00A0 => array(0xA0, "NO-BREAK SPACE"),
+0x040E => array(0xA1, "CYRILLIC CAPITAL LETTER SHORT U"),
+0x045E => array(0xA2, "CYRILLIC SMALL LETTER SHORT U"),
+0x0408 => array(0xA3, "CYRILLIC CAPITAL LETTER JE"),
+0x00A4 => array(0xA4, "CURRENCY SIGN"),
+0x0490 => array(0xA5, "CYRILLIC CAPITAL LETTER GHE WITH UPTURN"),
+0x00A6 => array(0xA6, "BROKEN BAR"),
+0x00A7 => array(0xA7, "SECTION SIGN"),
+0x0401 => array(0xA8, "CYRILLIC CAPITAL LETTER IO"),
+0x00A9 => array(0xA9, "COPYRIGHT SIGN"),
+0x0404 => array(0xAA, "CYRILLIC CAPITAL LETTER UKRAINIAN IE"),
+0x00AB => array(0xAB, "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK"),
+0x00AC => array(0xAC, "NOT SIGN"),
+0x00AD => array(0xAD, "SOFT HYPHEN"),
+0x00AE => array(0xAE, "REGISTERED SIGN"),
+0x0407 => array(0xAF, "CYRILLIC CAPITAL LETTER YI"),
+0x00B0 => array(0xB0, "DEGREE SIGN"),
+0x00B1 => array(0xB1, "PLUS-MINUS SIGN"),
+0x0406 => array(0xB2, "CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I"),
+0x0456 => array(0xB3, "CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I"),
+0x0491 => array(0xB4, "CYRILLIC SMALL LETTER GHE WITH UPTURN"),
+0x00B5 => array(0xB5, "MICRO SIGN"),
+0x00B6 => array(0xB6, "PILCROW SIGN"),
+0x00B7 => array(0xB7, "MIDDLE DOT"),
+0x0451 => array(0xB8, "CYRILLIC SMALL LETTER IO"),
+0x2116 => array(0xB9, "NUMERO SIGN"),
+0x0454 => array(0xBA, "CYRILLIC SMALL LETTER UKRAINIAN IE"),
+0x00BB => array(0xBB, "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK"),
+0x0458 => array(0xBC, "CYRILLIC SMALL LETTER JE"),
+0x0405 => array(0xBD, "CYRILLIC CAPITAL LETTER DZE"),
+0x0455 => array(0xBE, "CYRILLIC SMALL LETTER DZE"),
+0x0457 => array(0xBF, "CYRILLIC SMALL LETTER YI"),
+0x0410 => array(0xC0, "CYRILLIC CAPITAL LETTER A"),
+0x0411 => array(0xC1, "CYRILLIC CAPITAL LETTER BE"),
+0x0412 => array(0xC2, "CYRILLIC CAPITAL LETTER VE"),
+0x0413 => array(0xC3, "CYRILLIC CAPITAL LETTER GHE"),
+0x0414 => array(0xC4, "CYRILLIC CAPITAL LETTER DE"),
+0x0415 => array(0xC5, "CYRILLIC CAPITAL LETTER IE"),
+0x0416 => array(0xC6, "CYRILLIC CAPITAL LETTER ZHE"),
+0x0417 => array(0xC7, "CYRILLIC CAPITAL LETTER ZE"),
+0x0418 => array(0xC8, "CYRILLIC CAPITAL LETTER I"),
+0x0419 => array(0xC9, "CYRILLIC CAPITAL LETTER SHORT I"),
+0x041A => array(0xCA, "CYRILLIC CAPITAL LETTER KA"),
+0x041B => array(0xCB, "CYRILLIC CAPITAL LETTER EL"),
+0x041C => array(0xCC, "CYRILLIC CAPITAL LETTER EM"),
+0x041D => array(0xCD, "CYRILLIC CAPITAL LETTER EN"),
+0x041E => array(0xCE, "CYRILLIC CAPITAL LETTER O"),
+0x041F => array(0xCF, "CYRILLIC CAPITAL LETTER PE"),
+0x0420 => array(0xD0, "CYRILLIC CAPITAL LETTER ER"),
+0x0421 => array(0xD1, "CYRILLIC CAPITAL LETTER ES"),
+0x0422 => array(0xD2, "CYRILLIC CAPITAL LETTER TE"),
+0x0423 => array(0xD3, "CYRILLIC CAPITAL LETTER U"),
+0x0424 => array(0xD4, "CYRILLIC CAPITAL LETTER EF"),
+0x0425 => array(0xD5, "CYRILLIC CAPITAL LETTER HA"),
+0x0426 => array(0xD6, "CYRILLIC CAPITAL LETTER TSE"),
+0x0427 => array(0xD7, "CYRILLIC CAPITAL LETTER CHE"),
+0x0428 => array(0xD8, "CYRILLIC CAPITAL LETTER SHA"),
+0x0429 => array(0xD9, "CYRILLIC CAPITAL LETTER SHCHA"),
+0x042A => array(0xDA, "CYRILLIC CAPITAL LETTER HARD SIGN"),
+0x042B => array(0xDB, "CYRILLIC CAPITAL LETTER YERU"),
+0x042C => array(0xDC, "CYRILLIC CAPITAL LETTER SOFT SIGN"),
+0x042D => array(0xDD, "CYRILLIC CAPITAL LETTER E"),
+0x042E => array(0xDE, "CYRILLIC CAPITAL LETTER YU"),
+0x042F => array(0xDF, "CYRILLIC CAPITAL LETTER YA"),
+0x0430 => array(0xE0, "CYRILLIC SMALL LETTER A"),
+0x0431 => array(0xE1, "CYRILLIC SMALL LETTER BE"),
+0x0432 => array(0xE2, "CYRILLIC SMALL LETTER VE"),
+0x0433 => array(0xE3, "CYRILLIC SMALL LETTER GHE"),
+0x0434 => array(0xE4, "CYRILLIC SMALL LETTER DE"),
+0x0435 => array(0xE5, "CYRILLIC SMALL LETTER IE"),
+0x0436 => array(0xE6, "CYRILLIC SMALL LETTER ZHE"),
+0x0437 => array(0xE7, "CYRILLIC SMALL LETTER ZE"),
+0x0438 => array(0xE8, "CYRILLIC SMALL LETTER I"),
+0x0439 => array(0xE9, "CYRILLIC SMALL LETTER SHORT I"),
+0x043A => array(0xEA, "CYRILLIC SMALL LETTER KA"),
+0x043B => array(0xEB, "CYRILLIC SMALL LETTER EL"),
+0x043C => array(0xEC, "CYRILLIC SMALL LETTER EM"),
+0x043D => array(0xED, "CYRILLIC SMALL LETTER EN"),
+0x043E => array(0xEE, "CYRILLIC SMALL LETTER O"),
+0x043F => array(0xEF, "CYRILLIC SMALL LETTER PE"),
+0x0440 => array(0xF0, "CYRILLIC SMALL LETTER ER"),
+0x0441 => array(0xF1, "CYRILLIC SMALL LETTER ES"),
+0x0442 => array(0xF2, "CYRILLIC SMALL LETTER TE"),
+0x0443 => array(0xF3, "CYRILLIC SMALL LETTER U"),
+0x0444 => array(0xF4, "CYRILLIC SMALL LETTER EF"),
+0x0445 => array(0xF5, "CYRILLIC SMALL LETTER HA"),
+0x0446 => array(0xF6, "CYRILLIC SMALL LETTER TSE"),
+0x0447 => array(0xF7, "CYRILLIC SMALL LETTER CHE"),
+0x0448 => array(0xF8, "CYRILLIC SMALL LETTER SHA"),
+0x0449 => array(0xF9, "CYRILLIC SMALL LETTER SHCHA"),
+0x044A => array(0xFA, "CYRILLIC SMALL LETTER HARD SIGN"),
+0x044B => array(0xFB, "CYRILLIC SMALL LETTER YERU"),
+0x044C => array(0xFC, "CYRILLIC SMALL LETTER SOFT SIGN"),
+0x044D => array(0xFD, "CYRILLIC SMALL LETTER E"),
+0x044E => array(0xFE, "CYRILLIC SMALL LETTER YU"),
+0x044F => array(0xFF, "CYRILLIC SMALL LETTER YA"),
+);
+
+$res = html_entity_decode("&#x98;", ENT_QUOTES, 'WINDOWS-1251'); // 0x98 is the slot left out of $arr (marked #UNDEFINED there)
+echo "Special test for &#x98; (shouldn't decode):\n";
+echo $res,"\n\n"; // the EXPECT section requires the entity text to come back unchanged
+
+foreach ($arr as $u => $v) { // $u: Unicode code point; $v[0]: single-byte value; $v[1]: character name
+ $ent = sprintf("&#x%X;", $u); // first pass: numeric entity built from the Unicode code point
+ $res = html_entity_decode($ent, ENT_QUOTES, 'WINDOWS-1251');
+ $d = unpack("H*", $res); // hex dump of the decoded string; read back below from index 1
+ echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]);
+
+ $ent = sprintf("&#x%X;", $v[0]); // second pass: numeric entity built from the byte value itself
+ $res = html_entity_decode($ent, ENT_QUOTES, 'WINDOWS-1251');
+ if ($res[0] != "&" || $res[1] != "#") // result no longer starts with "&#", so it was decoded
+ $res = unpack("H*", $res)[1]; // print decoded bytes as hex; an undecoded entity is printed as-is
+ echo sprintf("%s => %s\n\n", $ent, $res);
+}
+--EXPECT--
+Special test for &#x98; (shouldn't decode):
+&#x98;
+
+CYRILLIC CAPITAL LETTER DJE: &#x402; => 80
+&#x80; => &#x80;
+
+CYRILLIC CAPITAL LETTER GJE: &#x403; => 81
+&#x81; => &#x81;
+
+SINGLE LOW-9 QUOTATION MARK: &#x201A; => 82
+&#x82; => &#x82;
+
+CYRILLIC SMALL LETTER GJE: &#x453; => 83
+&#x83; => &#x83;
+
+DOUBLE LOW-9 QUOTATION MARK: &#x201E; => 84
+&#x84; => &#x84;
+
+HORIZONTAL ELLIPSIS: &#x2026; => 85
+&#x85; => &#x85;
+
+DAGGER: &#x2020; => 86
+&#x86; => &#x86;
+
+DOUBLE DAGGER: &#x2021; => 87
+&#x87; => &#x87;
+
+EURO SIGN: &#x20AC; => 88
+&#x88; => &#x88;
+
+PER MILLE SIGN: &#x2030; => 89
+&#x89; => &#x89;
+
+CYRILLIC CAPITAL LETTER LJE: &#x409; => 8a
+&#x8A; => &#x8A;
+
+SINGLE LEFT-POINTING ANGLE QUOTATION MARK: &#x2039; => 8b
+&#x8B; => &#x8B;
+
+CYRILLIC CAPITAL LETTER NJE: &#x40A; => 8c
+&#x8C; => &#x8C;
+
+CYRILLIC CAPITAL LETTER KJE: &#x40C; => 8d
+&#x8D; => &#x8D;
+
+CYRILLIC CAPITAL LETTER TSHE: &#x40B; => 8e
+&#x8E; => &#x8E;
+
+CYRILLIC CAPITAL LETTER DZHE: &#x40F; => 8f
+&#x8F; => &#x8F;
+
+CYRILLIC SMALL LETTER DJE: &#x452; => 90
+&#x90; => &#x90;
+
+LEFT SINGLE QUOTATION MARK: &#x2018; => 91
+&#x91; => &#x91;
+
+RIGHT SINGLE QUOTATION MARK: &#x2019; => 92
+&#x92; => &#x92;
+
+LEFT DOUBLE QUOTATION MARK: &#x201C; => 93
+&#x93; => &#x93;
+
+RIGHT DOUBLE QUOTATION MARK: &#x201D; => 94
+&#x94; => &#x94;
+
+BULLET: &#x2022; => 95
+&#x95; => &#x95;
+
+EN DASH: &#x2013; => 96
+&#x96; => &#x96;
+
+EM DASH: &#x2014; => 97
+&#x97; => &#x97;
+
+TRADE MARK SIGN: &#x2122; => 99
+&#x99; => &#x99;
+
+CYRILLIC SMALL LETTER LJE: &#x459; => 9a
+&#x9A; => &#x9A;
+
+SINGLE RIGHT-POINTING ANGLE QUOTATION MARK: &#x203A; => 9b
+&#x9B; => &#x9B;
+
+CYRILLIC SMALL LETTER NJE: &#x45A; => 9c
+&#x9C; => &#x9C;
+
+CYRILLIC SMALL LETTER KJE: &#x45C; => 9d
+&#x9D; => &#x9D;
+
+CYRILLIC SMALL LETTER TSHE: &#x45B; => 9e
+&#x9E; => &#x9E;
+
+CYRILLIC SMALL LETTER DZHE: &#x45F; => 9f
+&#x9F; => &#x9F;
+
+NO-BREAK SPACE: &#xA0; => a0
+&#xA0; => a0
+
+CYRILLIC CAPITAL LETTER SHORT U: &#x40E; => a1
+&#xA1; => &#xA1;
+
+CYRILLIC SMALL LETTER SHORT U: &#x45E; => a2
+&#xA2; => &#xA2;
+
+CYRILLIC CAPITAL LETTER JE: &#x408; => a3
+&#xA3; => &#xA3;
+
+CURRENCY SIGN: &#xA4; => a4
+&#xA4; => a4
+
+CYRILLIC CAPITAL LETTER GHE WITH UPTURN: &#x490; => a5
+&#xA5; => &#xA5;
+
+BROKEN BAR: &#xA6; => a6
+&#xA6; => a6
+
+SECTION SIGN: &#xA7; => a7
+&#xA7; => a7
+
+CYRILLIC CAPITAL LETTER IO: &#x401; => a8
+&#xA8; => &#xA8;
+
+COPYRIGHT SIGN: &#xA9; => a9
+&#xA9; => a9
+
+CYRILLIC CAPITAL LETTER UKRAINIAN IE: &#x404; => aa
+&#xAA; => &#xAA;
+
+LEFT-POINTING DOUBLE ANGLE QUOTATION MARK: &#xAB; => ab
+&#xAB; => ab
+
+NOT SIGN: &#xAC; => ac
+&#xAC; => ac
+
+SOFT HYPHEN: &#xAD; => ad
+&#xAD; => ad
+
+REGISTERED SIGN: &#xAE; => ae
+&#xAE; => ae
+
+CYRILLIC CAPITAL LETTER YI: &#x407; => af
+&#xAF; => &#xAF;
+
+DEGREE SIGN: &#xB0; => b0
+&#xB0; => b0
+
+PLUS-MINUS SIGN: &#xB1; => b1
+&#xB1; => b1
+
+CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I: &#x406; => b2
+&#xB2; => &#xB2;
+
+CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I: &#x456; => b3
+&#xB3; => &#xB3;
+
+CYRILLIC SMALL LETTER GHE WITH UPTURN: &#x491; => b4
+&#xB4; => &#xB4;
+
+MICRO SIGN: &#xB5; => b5
+&#xB5; => b5
+
+PILCROW SIGN: &#xB6; => b6
+&#xB6; => b6
+
+MIDDLE DOT: &#xB7; => b7
+&#xB7; => b7
+
+CYRILLIC SMALL LETTER IO: &#x451; => b8
+&#xB8; => &#xB8;
+
+NUMERO SIGN: &#x2116; => b9
+&#xB9; => &#xB9;
+
+CYRILLIC SMALL LETTER UKRAINIAN IE: &#x454; => ba
+&#xBA; => &#xBA;
+
+RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK: &#xBB; => bb
+&#xBB; => bb
+
+CYRILLIC SMALL LETTER JE: &#x458; => bc
+&#xBC; => &#xBC;
+
+CYRILLIC CAPITAL LETTER DZE: &#x405; => bd
+&#xBD; => &#xBD;
+
+CYRILLIC SMALL LETTER DZE: &#x455; => be
+&#xBE; => &#xBE;
+
+CYRILLIC SMALL LETTER YI: &#x457; => bf
+&#xBF; => &#xBF;
+
+CYRILLIC CAPITAL LETTER A: &#x410; => c0
+&#xC0; => &#xC0;
+
+CYRILLIC CAPITAL LETTER BE: &#x411; => c1
+&#xC1; => &#xC1;
+
+CYRILLIC CAPITAL LETTER VE: &#x412; => c2
+&#xC2; => &#xC2;
+
+CYRILLIC CAPITAL LETTER GHE: &#x413; => c3
+&#xC3; => &#xC3;
+
+CYRILLIC CAPITAL LETTER DE: &#x414; => c4
+&#xC4; => &#xC4;
+
+CYRILLIC CAPITAL LETTER IE: &#x415; => c5
+&#xC5; => &#xC5;
+
+CYRILLIC CAPITAL LETTER ZHE: &#x416; => c6
+&#xC6; => &#xC6;
+
+CYRILLIC CAPITAL LETTER ZE: &#x417; => c7
+&#xC7; => &#xC7;
+
+CYRILLIC CAPITAL LETTER I: &#x418; => c8
+&#xC8; => &#xC8;
+
+CYRILLIC CAPITAL LETTER SHORT I: &#x419; => c9
+&#xC9; => &#xC9;
+
+CYRILLIC CAPITAL LETTER KA: &#x41A; => ca
+&#xCA; => &#xCA;
+
+CYRILLIC CAPITAL LETTER EL: &#x41B; => cb
+&#xCB; => &#xCB;
+
+CYRILLIC CAPITAL LETTER EM: &#x41C; => cc
+&#xCC; => &#xCC;
+
+CYRILLIC CAPITAL LETTER EN: &#x41D; => cd
+&#xCD; => &#xCD;
+
+CYRILLIC CAPITAL LETTER O: &#x41E; => ce
+&#xCE; => &#xCE;
+
+CYRILLIC CAPITAL LETTER PE: &#x41F; => cf
+&#xCF; => &#xCF;
+
+CYRILLIC CAPITAL LETTER ER: &#x420; => d0
+&#xD0; => &#xD0;
+
+CYRILLIC CAPITAL LETTER ES: &#x421; => d1
+&#xD1; => &#xD1;
+
+CYRILLIC CAPITAL LETTER TE: &#x422; => d2
+&#xD2; => &#xD2;
+
+CYRILLIC CAPITAL LETTER U: &#x423; => d3
+&#xD3; => &#xD3;
+
+CYRILLIC CAPITAL LETTER EF: &#x424; => d4
+&#xD4; => &#xD4;
+
+CYRILLIC CAPITAL LETTER HA: &#x425; => d5
+&#xD5; => &#xD5;
+
+CYRILLIC CAPITAL LETTER TSE: &#x426; => d6
+&#xD6; => &#xD6;
+
+CYRILLIC CAPITAL LETTER CHE: &#x427; => d7
+&#xD7; => &#xD7;
+
+CYRILLIC CAPITAL LETTER SHA: &#x428; => d8
+&#xD8; => &#xD8;
+
+CYRILLIC CAPITAL LETTER SHCHA: &#x429; => d9
+&#xD9; => &#xD9;
+
+CYRILLIC CAPITAL LETTER HARD SIGN: &#x42A; => da
+&#xDA; => &#xDA;
+
+CYRILLIC CAPITAL LETTER YERU: &#x42B; => db
+&#xDB; => &#xDB;
+
+CYRILLIC CAPITAL LETTER SOFT SIGN: &#x42C; => dc
+&#xDC; => &#xDC;
+
+CYRILLIC CAPITAL LETTER E: &#x42D; => dd
+&#xDD; => &#xDD;
+
+CYRILLIC CAPITAL LETTER YU: &#x42E; => de
+&#xDE; => &#xDE;
+
+CYRILLIC CAPITAL LETTER YA: &#x42F; => df
+&#xDF; => &#xDF;
+
+CYRILLIC SMALL LETTER A: &#x430; => e0
+&#xE0; => &#xE0;
+
+CYRILLIC SMALL LETTER BE: &#x431; => e1
+&#xE1; => &#xE1;
+
+CYRILLIC SMALL LETTER VE: &#x432; => e2
+&#xE2; => &#xE2;
+
+CYRILLIC SMALL LETTER GHE: &#x433; => e3
+&#xE3; => &#xE3;
+
+CYRILLIC SMALL LETTER DE: &#x434; => e4
+&#xE4; => &#xE4;
+
+CYRILLIC SMALL LETTER IE: &#x435; => e5
+&#xE5; => &#xE5;
+
+CYRILLIC SMALL LETTER ZHE: &#x436; => e6
+&#xE6; => &#xE6;
+
+CYRILLIC SMALL LETTER ZE: &#x437; => e7
+&#xE7; => &#xE7;
+
+CYRILLIC SMALL LETTER I: &#x438; => e8
+&#xE8; => &#xE8;
+
+CYRILLIC SMALL LETTER SHORT I: &#x439; => e9
+&#xE9; => &#xE9;
+
+CYRILLIC SMALL LETTER KA: &#x43A; => ea
+&#xEA; => &#xEA;
+
+CYRILLIC SMALL LETTER EL: &#x43B; => eb
+&#xEB; => &#xEB;
+
+CYRILLIC SMALL LETTER EM: &#x43C; => ec
+&#xEC; => &#xEC;
+
+CYRILLIC SMALL LETTER EN: &#x43D; => ed
+&#xED; => &#xED;
+
+CYRILLIC SMALL LETTER O: &#x43E; => ee
+&#xEE; => &#xEE;
+
+CYRILLIC SMALL LETTER PE: &#x43F; => ef
+&#xEF; => &#xEF;
+
+CYRILLIC SMALL LETTER ER: &#x440; => f0
+&#xF0; => &#xF0;
+
+CYRILLIC SMALL LETTER ES: &#x441; => f1
+&#xF1; => &#xF1;
+
+CYRILLIC SMALL LETTER TE: &#x442; => f2
+&#xF2; => &#xF2;
+
+CYRILLIC SMALL LETTER U: &#x443; => f3
+&#xF3; => &#xF3;
+
+CYRILLIC SMALL LETTER EF: &#x444; => f4
+&#xF4; => &#xF4;
+
+CYRILLIC SMALL LETTER HA: &#x445; => f5
+&#xF5; => &#xF5;
+
+CYRILLIC SMALL LETTER TSE: &#x446; => f6
+&#xF6; => &#xF6;
+
+CYRILLIC SMALL LETTER CHE: &#x447; => f7
+&#xF7; => &#xF7;
+
+CYRILLIC SMALL LETTER SHA: &#x448; => f8
+&#xF8; => &#xF8;
+
+CYRILLIC SMALL LETTER SHCHA: &#x449; => f9
+&#xF9; => &#xF9;
+
+CYRILLIC SMALL LETTER HARD SIGN: &#x44A; => fa
+&#xFA; => &#xFA;
+
+CYRILLIC SMALL LETTER YERU: &#x44B; => fb
+&#xFB; => &#xFB;
+
+CYRILLIC SMALL LETTER SOFT SIGN: &#x44C; => fc
+&#xFC; => &#xFC;
+
+CYRILLIC SMALL LETTER E: &#x44D; => fd
+&#xFD; => &#xFD;
+
+CYRILLIC SMALL LETTER YU: &#x44E; => fe
+&#xFE; => &#xFE;
+
+CYRILLIC SMALL LETTER YA: &#x44F; => ff
+&#xFF; => &#xFF;
+
+
diff --git a/ext/standard/tests/strings/html_entity_decode_win1252.phpt b/ext/standard/tests/strings/html_entity_decode_win1252.phpt
new file mode 100644
index 0000000000..2a7a6981dc
--- /dev/null
+++ b/ext/standard/tests/strings/html_entity_decode_win1252.phpt
@@ -0,0 +1,169 @@
+--TEST--
+Translation of HTML entities for encoding WIN-1252
+--FILE--
+<?php
+$arr = array( // Unicode code point => array(byte value in WINDOWS-1252, Unicode character name); only bytes 0x80-0x9F are listed
+0x20AC => array(0x80, "EURO SIGN"),
+//0x81 #UNDEFINED
+0x201A => array(0x82, "SINGLE LOW-9 QUOTATION MARK"),
+0x0192 => array(0x83, "LATIN SMALL LETTER F WITH HOOK"),
+0x201E => array(0x84, "DOUBLE LOW-9 QUOTATION MARK"),
+0x2026 => array(0x85, "HORIZONTAL ELLIPSIS"),
+0x2020 => array(0x86, "DAGGER"),
+0x2021 => array(0x87, "DOUBLE DAGGER"),
+0x02C6 => array(0x88, "MODIFIER LETTER CIRCUMFLEX ACCENT"),
+0x2030 => array(0x89, "PER MILLE SIGN"),
+0x0160 => array(0x8A, "LATIN CAPITAL LETTER S WITH CARON"),
+0x2039 => array(0x8B, "SINGLE LEFT-POINTING ANGLE QUOTATION MARK"),
+0x0152 => array(0x8C, "LATIN CAPITAL LIGATURE OE"),
+//0x8D #UNDEFINED
+0x017D => array(0x8E, "LATIN CAPITAL LETTER Z WITH CARON"),
+//0x8F #UNDEFINED
+//0x90 #UNDEFINED
+0x2018 => array(0x91, "LEFT SINGLE QUOTATION MARK"),
+0x2019 => array(0x92, "RIGHT SINGLE QUOTATION MARK"),
+0x201C => array(0x93, "LEFT DOUBLE QUOTATION MARK"),
+0x201D => array(0x94, "RIGHT DOUBLE QUOTATION MARK"),
+0x2022 => array(0x95, "BULLET"),
+0x2013 => array(0x96, "EN DASH"),
+0x2014 => array(0x97, "EM DASH"),
+0x02DC => array(0x98, "SMALL TILDE"),
+0x2122 => array(0x99, "TRADE MARK SIGN"),
+0x0161 => array(0x9A, "LATIN SMALL LETTER S WITH CARON"),
+0x203A => array(0x9B, "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK"),
+0x0153 => array(0x9C, "LATIN SMALL LIGATURE OE"),
+//0x9D #UNDEFINED
+0x017E => array(0x9E, "LATIN SMALL LETTER Z WITH CARON"),
+0x0178 => array(0x9F, "LATIN CAPITAL LETTER Y WITH DIAERESIS"),
+);
+
+$res = html_entity_decode("&#x81;", ENT_QUOTES, 'WINDOWS-1252'); // 0x81, 0x8D, 0x8F, 0x90 and 0x9D are the slots commented out of $arr as #UNDEFINED
+echo "Special test for &#x81; (shouldn't decode):\n";
+echo $res,"\n\n"; // the EXPECT section requires each of these entities to come back unchanged
+
+$res = html_entity_decode("&#x8D;", ENT_QUOTES, 'WINDOWS-1252');
+echo "Special test for &#x8D; (shouldn't decode):\n";
+echo $res,"\n\n";
+
+$res = html_entity_decode("&#x8F;", ENT_QUOTES, 'WINDOWS-1252');
+echo "Special test for &#x8F; (shouldn't decode):\n";
+echo $res,"\n\n";
+
+$res = html_entity_decode("&#x90;", ENT_QUOTES, 'WINDOWS-1252');
+echo "Special test for &#x90; (shouldn't decode):\n";
+echo $res,"\n\n";
+
+$res = html_entity_decode("&#x9D;", ENT_QUOTES, 'WINDOWS-1252');
+echo "Special test for &#x9D; (shouldn't decode):\n";
+echo $res,"\n\n";
+
+foreach ($arr as $u => $v) { // $u: Unicode code point; $v[0]: single-byte value; $v[1]: character name
+ $ent = sprintf("&#x%X;", $u); // first pass: numeric entity built from the Unicode code point
+ $res = html_entity_decode($ent, ENT_QUOTES, 'WINDOWS-1252');
+ $d = unpack("H*", $res); // hex dump of the decoded string; read back below from index 1
+ echo sprintf("%s: %s => %s\n", $v[1], $ent, $d[1]);
+
+ $ent = sprintf("&#x%X;", $v[0]); // second pass: numeric entity built from the byte value itself
+ $res = html_entity_decode($ent, ENT_QUOTES, 'WINDOWS-1252');
+ if ($res[0] != "&" || $res[1] != "#") // result no longer starts with "&#", so it was decoded
+ $res = unpack("H*", $res)[1]; // print decoded bytes as hex; an undecoded entity is printed as-is
+ echo sprintf("%s => %s\n\n", $ent, $res);
+}
+--EXPECT--
+Special test for &#x81; (shouldn't decode):
+&#x81;
+
+Special test for &#x8D; (shouldn't decode):
+&#x8D;
+
+Special test for &#x8F; (shouldn't decode):
+&#x8F;
+
+Special test for &#x90; (shouldn't decode):
+&#x90;
+
+Special test for &#x9D; (shouldn't decode):
+&#x9D;
+
+EURO SIGN: &#x20AC; => 80
+&#x80; => &#x80;
+
+SINGLE LOW-9 QUOTATION MARK: &#x201A; => 82
+&#x82; => &#x82;
+
+LATIN SMALL LETTER F WITH HOOK: &#x192; => 83
+&#x83; => &#x83;
+
+DOUBLE LOW-9 QUOTATION MARK: &#x201E; => 84
+&#x84; => &#x84;
+
+HORIZONTAL ELLIPSIS: &#x2026; => 85
+&#x85; => &#x85;
+
+DAGGER: &#x2020; => 86
+&#x86; => &#x86;
+
+DOUBLE DAGGER: &#x2021; => 87
+&#x87; => &#x87;
+
+MODIFIER LETTER CIRCUMFLEX ACCENT: &#x2C6; => 88
+&#x88; => &#x88;
+
+PER MILLE SIGN: &#x2030; => 89
+&#x89; => &#x89;
+
+LATIN CAPITAL LETTER S WITH CARON: &#x160; => 8a
+&#x8A; => &#x8A;
+
+SINGLE LEFT-POINTING ANGLE QUOTATION MARK: &#x2039; => 8b
+&#x8B; => &#x8B;
+
+LATIN CAPITAL LIGATURE OE: &#x152; => 8c
+&#x8C; => &#x8C;
+
+LATIN CAPITAL LETTER Z WITH CARON: &#x17D; => 8e
+&#x8E; => &#x8E;
+
+LEFT SINGLE QUOTATION MARK: &#x2018; => 91
+&#x91; => &#x91;
+
+RIGHT SINGLE QUOTATION MARK: &#x2019; => 92
+&#x92; => &#x92;
+
+LEFT DOUBLE QUOTATION MARK: &#x201C; => 93
+&#x93; => &#x93;
+
+RIGHT DOUBLE QUOTATION MARK: &#x201D; => 94
+&#x94; => &#x94;
+
+BULLET: &#x2022; => 95
+&#x95; => &#x95;
+
+EN DASH: &#x2013; => 96
+&#x96; => &#x96;
+
+EM DASH: &#x2014; => 97
+&#x97; => &#x97;
+
+SMALL TILDE: &#x2DC; => 98
+&#x98; => &#x98;
+
+TRADE MARK SIGN: &#x2122; => 99
+&#x99; => &#x99;
+
+LATIN SMALL LETTER S WITH CARON: &#x161; => 9a
+&#x9A; => &#x9A;
+
+SINGLE RIGHT-POINTING ANGLE QUOTATION MARK: &#x203A; => 9b
+&#x9B; => &#x9B;
+
+LATIN SMALL LIGATURE OE: &#x153; => 9c
+&#x9C; => &#x9C;
+
+LATIN SMALL LETTER Z WITH CARON: &#x17E; => 9e
+&#x9E; => &#x9E;
+
+LATIN CAPITAL LETTER Y WITH DIAERESIS: &#x178; => 9f
+&#x9F; => &#x9F;
+
+
diff --git a/ext/standard/tests/strings/htmlentities17.phpt b/ext/standard/tests/strings/htmlentities17.phpt
index b203e7c3e0..d9e67a9b87 100644
--- a/ext/standard/tests/strings/htmlentities17.phpt
+++ b/ext/standard/tests/strings/htmlentities17.phpt
@@ -3,7 +3,6 @@ htmlentities() / html_entity_decode() #8592 - #9002 table test
--FILE--
<?php
$tests = array(
- array(8768, '&wreath;', "e28980"),
array(8853, '&oplus;', "e28a95"),
array(8855, '&otimes;', "e28a97"),
array(8869, '&perp;', "e28aa5"),
@@ -26,7 +25,6 @@ foreach ($tests as $test) {
}
?>
--EXPECT--
-string(8) "&wreath;"
string(7) "&oplus;"
string(8) "&otimes;"
string(6) "&perp;"
@@ -37,7 +35,6 @@ string(8) "&lfloor;"
string(8) "&rfloor;"
string(6) "&lang;"
string(6) "&rang;"
-string(6) "e28980"
string(6) "e28a95"
string(6) "e28a97"
string(6) "e28aa5"