1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
|
use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric);
use Test::More tests => 20;
# Round trip through decode_entities() and both encoders.  The calls below
# are made in void context and the result is read back from $a, i.e. the
# tests rely on the argument being rewritten in place.  The expected decoded
# text is spelled with \x escapes so that the comparison does not depend on
# the encoding this file happens to be saved in.
$a = "V&aring;re norske tegn b&oslash;r &#230res";
decode_entities($a);
is($a, "V\xE5re norske tegn b\xF8r \xE6res",
    "decode named and numeric references in place");

encode_entities($a);
is($a, "V&aring;re norske tegn b&oslash;r &aelig;res",
    "encode Latin-1 characters as named entities");

decode_entities($a);
encode_entities_numeric($a);
is($a, "V&#xE5;re norske tegn b&#xF8;r &#xE6;res",
    "encode Latin-1 characters as hexadecimal numeric entities");

# Default set of unsafe characters: markup delimiters and both quote marks.
# Here the return value is used and $a is reused for the second call.
$a = "<&>\"'";
is(encode_entities($a), "&lt;&amp;&gt;&quot;&#39;",
    "encode_entities() escapes markup characters by default");
is(encode_entities_numeric($a), "&#x3C;&#x26;&#x3E;&#x22;&#x27;",
    "encode_entities_numeric() escapes markup characters by default");

# An explicit second argument selects the characters to encode; it may be a
# range as in a regex character class.
$a = "abcdef";
is(encode_entities($a, 'a-c'), "&#97;&#98;&#99;def",
    "explicit character range is encoded");

# Characters that are awkward inside a character class: "/", "]" and a
# trailing backslash.
$a = "[24/7]\\";
is(encode_entities($a, '/'), "[24&#47;7]\\",
    "slash as the unsafe set");
is(encode_entities($a, '\\/'), "[24&#47;7]\\",
    "backslash-escaped slash as the unsafe set");
is(encode_entities($a, '\\'), "[24/7]&#92;",
    "lone backslash as the unsafe set");
is(encode_entities($a, ']\\'), "[24/7&#93;&#92;",
    "closing bracket and backslash as the unsafe set");
# See how well it does against rfc1866...
#
# Build two parallel strings from the <!ENTITY ...> declarations in the DATA
# section: $ent collects the named references ("&name;") and $plain collects
# the characters whose decimal code is given by the numeric reference in the
# same declaration.  Lines that are not entity declarations are skipped.
my ($ent, $plain) = ("", "");
while (my $line = <DATA>) {
    next unless $line =~ /^\s*<!ENTITY\s+(\w+)\s*CDATA\s*\"&\#(\d+)/;
    $ent   .= "&$1;";
    $plain .= chr($2);
}

# With nothing parsed the comparisons below would succeed on empty strings;
# say so instead of passing silently.
diag('no <!ENTITY ...> declarations with a numeric reference were found in DATA; '
    . 'the RFC 1866 round-trip tests are vacuous')
    unless length $ent;

$a = $ent;
decode_entities($a);
is($a, $plain, "decode every RFC 1866 Latin-1 entity");

# Try decoding when the ";" are left out
$a = $ent;
$a =~ s/;//g;
decode_entities($a);
is($a, $plain, "decode RFC 1866 entities without the terminating semicolon");

$a = $plain;
encode_entities($a);
is($a, $ent, "encode every RFC 1866 Latin-1 character back to its named entity");
{ #RT #84144 - https://rt.cpan.org/Public/Bug/Display.html?id=84144
    # Regression test: pass a hash key straight from keys() to
    # decode_entities().  The eval records whether the call died; the
    # decoded text is then compared with the value stored under that key.
    # The value uses \x escapes so it is independent of this file's encoding.
    my %hash = (
        "V&aring;re norske tegn b&oslash;r &#230res" => "V\xE5re norske tegn b\xF8r \xE6res",
    );

    my ($got, $eval_ok);
    $eval_ok = eval { $got = decode_entities((keys %hash)[0]); 1 };
    is($eval_ok, 1, "decode_entities() when processing a key as input");
    is($got, (values %hash)[0], "decode_entities() decodes a key properly");
}
# From: Bill Simpson-Young <bill.simpson-young@cmis.csiro.au>
# Subject: HTML entities problem with 5.11
# To: libwww-perl@ics.uci.edu
# Date: Fri, 05 Sep 1997 16:56:55 +1000
# Message-Id: <199709050657.QAA10089@snowy.nsw.cmis.CSIRO.AU>
#
# Hi. I've got a problem that has surfaced with the changes to
# HTML::Entities.pm for 5.11 (it doesn't happen with 5.08). It's happening
# in the process of encoding then decoding special entities. Eg, what goes
# in as "abc&def&ghi" comes out as "abc&def;&ghi;".
#
# Text that merely looks like an entity reference ("&def", "&abc;") must come
# back from decode_entities() unchanged.
is(decode_entities("abc&def&ghi&abc;&def;"), "abc&def&ghi&abc;&def;",
    "unknown entity names are left untouched");

# Decoding of &apos;
is(decode_entities("&apos;"), "'",
    "&apos; decodes to an apostrophe");
is(encode_entities("'", "'"), "&#39;",
    "an apostrophe encodes to a numeric reference");

# Decimal references above the Latin-1 range, written without the
# terminating ";" and directly followed by ordinary text.
is(decode_entities("Attention Home&#959&#969n&#1257rs...1&#1109t T&#1110&#1084e E&#957&#1257&#1075"),
    "Attention Home\x{3BF}\x{3C9}n\x{4E9}rs...1\x{455}t T\x{456}\x{43C}e E\x{3BD}\x{4E9}\x{433}",
    "unterminated decimal references to wide characters");

# Only one level of references is decoded: "&#38;amp;" and "&#x26;amp;"
# become the text "&amp;", while a plain "&amp;" becomes "&".
is(decode_entities("{&#38;amp;&#x26;amp;&amp; also &#x42f;&#339;}"),
    "{&amp;&amp;& also \x{42F}\x{153}}",
    "references are decoded exactly once");
__END__
# Quoted from rfc1866.txt
14. Proposed Entities
The HTML DTD references the "Added Latin 1" entity set, which only
supplies named entities for a subset of the non-ASCII characters in
[ISO-8859-1], namely the accented characters. The following entities
should be supported so that all ISO 8859-1 characters may only be
referenced symbolically. The names for these entities are taken from
the appendixes of [SGML].
<!ENTITY nbsp CDATA "&#160;" -- no-break space -->
<!ENTITY iexcl CDATA "&#161;" -- inverted exclamation mark -->
<!ENTITY cent CDATA "&#162;" -- cent sign -->
<!ENTITY pound CDATA "&#163;" -- pound sterling sign -->
<!ENTITY curren CDATA "&#164;" -- general currency sign -->
<!ENTITY yen CDATA "&#165;" -- yen sign -->
<!ENTITY brvbar CDATA "&#166;" -- broken (vertical) bar -->
<!ENTITY sect CDATA "&#167;" -- section sign -->
<!ENTITY uml CDATA "&#168;" -- umlaut (dieresis) -->
<!ENTITY copy CDATA "&#169;" -- copyright sign -->
<!ENTITY ordf CDATA "&#170;" -- ordinal indicator, feminine -->
<!ENTITY laquo CDATA "&#171;" -- angle quotation mark, left -->
<!ENTITY not CDATA "&#172;" -- not sign -->
<!ENTITY shy CDATA "&#173;" -- soft hyphen -->
<!ENTITY reg CDATA "&#174;" -- registered sign -->
<!ENTITY macr CDATA "&#175;" -- macron -->
<!ENTITY deg CDATA "&#176;" -- degree sign -->
<!ENTITY plusmn CDATA "&#177;" -- plus-or-minus sign -->
<!ENTITY sup2 CDATA "&#178;" -- superscript two -->
<!ENTITY sup3 CDATA "&#179;" -- superscript three -->
<!ENTITY acute CDATA "&#180;" -- acute accent -->
<!ENTITY micro CDATA "&#181;" -- micro sign -->
<!ENTITY para CDATA "&#182;" -- pilcrow (paragraph sign) -->
<!ENTITY middot CDATA "&#183;" -- middle dot -->
<!ENTITY cedil CDATA "&#184;" -- cedilla -->
<!ENTITY sup1 CDATA "&#185;" -- superscript one -->
<!ENTITY ordm CDATA "&#186;" -- ordinal indicator, masculine -->
<!ENTITY raquo CDATA "&#187;" -- angle quotation mark, right -->
<!ENTITY frac14 CDATA "&#188;" -- fraction one-quarter -->
<!ENTITY frac12 CDATA "&#189;" -- fraction one-half -->
<!ENTITY frac34 CDATA "&#190;" -- fraction three-quarters -->
<!ENTITY iquest CDATA "&#191;" -- inverted question mark -->
<!ENTITY Agrave CDATA "&#192;" -- capital A, grave accent -->
<!ENTITY Aacute CDATA "&#193;" -- capital A, acute accent -->
<!ENTITY Acirc CDATA "&#194;" -- capital A, circumflex accent -->
Berners-Lee & Connolly Standards Track [Page 75]
RFC 1866 Hypertext Markup Language - 2.0 November 1995
<!ENTITY Atilde CDATA "&#195;" -- capital A, tilde -->
<!ENTITY Auml CDATA "&#196;" -- capital A, dieresis or umlaut mark -->
<!ENTITY Aring CDATA "&#197;" -- capital A, ring -->
<!ENTITY AElig CDATA "&#198;" -- capital AE diphthong (ligature) -->
<!ENTITY Ccedil CDATA "&#199;" -- capital C, cedilla -->
<!ENTITY Egrave CDATA "&#200;" -- capital E, grave accent -->
<!ENTITY Eacute CDATA "&#201;" -- capital E, acute accent -->
<!ENTITY Ecirc CDATA "&#202;" -- capital E, circumflex accent -->
<!ENTITY Euml CDATA "&#203;" -- capital E, dieresis or umlaut mark -->
<!ENTITY Igrave CDATA "&#204;" -- capital I, grave accent -->
<!ENTITY Iacute CDATA "&#205;" -- capital I, acute accent -->
<!ENTITY Icirc CDATA "&#206;" -- capital I, circumflex accent -->
<!ENTITY Iuml CDATA "&#207;" -- capital I, dieresis or umlaut mark -->
<!ENTITY ETH CDATA "&#208;" -- capital Eth, Icelandic -->
<!ENTITY Ntilde CDATA "&#209;" -- capital N, tilde -->
<!ENTITY Ograve CDATA "&#210;" -- capital O, grave accent -->
<!ENTITY Oacute CDATA "&#211;" -- capital O, acute accent -->
<!ENTITY Ocirc CDATA "&#212;" -- capital O, circumflex accent -->
<!ENTITY Otilde CDATA "&#213;" -- capital O, tilde -->
<!ENTITY Ouml CDATA "&#214;" -- capital O, dieresis or umlaut mark -->
<!ENTITY times CDATA "&#215;" -- multiply sign -->
<!ENTITY Oslash CDATA "&#216;" -- capital O, slash -->
<!ENTITY Ugrave CDATA "&#217;" -- capital U, grave accent -->
<!ENTITY Uacute CDATA "&#218;" -- capital U, acute accent -->
<!ENTITY Ucirc CDATA "&#219;" -- capital U, circumflex accent -->
<!ENTITY Uuml CDATA "&#220;" -- capital U, dieresis or umlaut mark -->
<!ENTITY Yacute CDATA "&#221;" -- capital Y, acute accent -->
<!ENTITY THORN CDATA "&#222;" -- capital THORN, Icelandic -->
<!ENTITY szlig CDATA "&#223;" -- small sharp s, German (sz ligature) -->
<!ENTITY agrave CDATA "&#224;" -- small a, grave accent -->
<!ENTITY aacute CDATA "&#225;" -- small a, acute accent -->
<!ENTITY acirc CDATA "&#226;" -- small a, circumflex accent -->
<!ENTITY atilde CDATA "&#227;" -- small a, tilde -->
<!ENTITY auml CDATA "&#228;" -- small a, dieresis or umlaut mark -->
<!ENTITY aring CDATA "&#229;" -- small a, ring -->
<!ENTITY aelig CDATA "&#230;" -- small ae diphthong (ligature) -->
<!ENTITY ccedil CDATA "&#231;" -- small c, cedilla -->
<!ENTITY egrave CDATA "&#232;" -- small e, grave accent -->
<!ENTITY eacute CDATA "&#233;" -- small e, acute accent -->
<!ENTITY ecirc CDATA "&#234;" -- small e, circumflex accent -->
<!ENTITY euml CDATA "&#235;" -- small e, dieresis or umlaut mark -->
<!ENTITY igrave CDATA "&#236;" -- small i, grave accent -->
<!ENTITY iacute CDATA "&#237;" -- small i, acute accent -->
<!ENTITY icirc CDATA "&#238;" -- small i, circumflex accent -->
<!ENTITY iuml CDATA "&#239;" -- small i, dieresis or umlaut mark -->
<!ENTITY eth CDATA "&#240;" -- small eth, Icelandic -->
<!ENTITY ntilde CDATA "&#241;" -- small n, tilde -->
<!ENTITY ograve CDATA "&#242;" -- small o, grave accent -->
Berners-Lee & Connolly Standards Track [Page 76]
RFC 1866 Hypertext Markup Language - 2.0 November 1995
<!ENTITY oacute CDATA "&#243;" -- small o, acute accent -->
<!ENTITY ocirc CDATA "&#244;" -- small o, circumflex accent -->
<!ENTITY otilde CDATA "&#245;" -- small o, tilde -->
<!ENTITY ouml CDATA "&#246;" -- small o, dieresis or umlaut mark -->
<!ENTITY divide CDATA "&#247;" -- divide sign -->
<!ENTITY oslash CDATA "&#248;" -- small o, slash -->
<!ENTITY ugrave CDATA "&#249;" -- small u, grave accent -->
<!ENTITY uacute CDATA "&#250;" -- small u, acute accent -->
<!ENTITY ucirc CDATA "&#251;" -- small u, circumflex accent -->
<!ENTITY uuml CDATA "&#252;" -- small u, dieresis or umlaut mark -->
<!ENTITY yacute CDATA "&#253;" -- small y, acute accent -->
<!ENTITY thorn CDATA "&#254;" -- small thorn, Icelandic -->
<!ENTITY yuml CDATA "&#255;" -- small y, dieresis or umlaut mark -->
|