summaryrefslogtreecommitdiff
path: root/lib/HTML/Entities.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/HTML/Entities.pm')
-rw-r--r--lib/HTML/Entities.pm483
1 files changed, 483 insertions, 0 deletions
diff --git a/lib/HTML/Entities.pm b/lib/HTML/Entities.pm
new file mode 100644
index 0000000..ecd8e0d
--- /dev/null
+++ b/lib/HTML/Entities.pm
@@ -0,0 +1,483 @@
+package HTML::Entities;
+
+=encoding utf8
+
+=head1 NAME
+
+HTML::Entities - Encode or decode strings with HTML entities
+
+=head1 SYNOPSIS
+
+ use HTML::Entities;
+
+ $a = "Våre norske tegn bør &#230res";
+ decode_entities($a);
+ encode_entities($a, "\200-\377");
+
+For example, this:
+
+ $input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
+ print encode_entities($input), "\n"
+
+Prints this out:
+
+ vis-à-vis Beyoncé's naïve
+ papier-mâché résumé
+
+=head1 DESCRIPTION
+
+This module deals with encoding and decoding of strings with HTML
+character entities. The module provides the following functions:
+
+=over 4
+
+=item decode_entities( $string, ... )
+
+This routine replaces HTML entities found in the $string with the
+corresponding Unicode character. Unrecognized entities are left alone.
+
+If multiple strings are provided as argument they are each decoded
+separately and the same number of strings are returned.
+
+If called in void context the arguments are decoded in-place.
+
+This routine is exported by default.
+
+=item _decode_entities( $string, \%entity2char )
+
+=item _decode_entities( $string, \%entity2char, $expand_prefix )
+
+This will in-place replace HTML entities in $string. The %entity2char
+hash must be provided. Named entities not found in the %entity2char
+hash are left alone. Numeric entities are expanded unless their value
+overflow.
+
+The keys in %entity2char are the entity names to be expanded and their
+values are what they should expand into. The values do not have to be
+single character strings. If a key has ";" as suffix,
+then occurrences in $string are only expanded if properly terminated
+with ";". Entities without ";" will be expanded regardless of how
+they are terminated for compatibility with how common browsers treat
+entities in the Latin-1 range.
+
+If $expand_prefix is TRUE then entities without trailing ";" in
+%entity2char will even be expanded as a prefix of a longer
+unrecognized name. The longest matching name in %entity2char will be
+used. This is mainly present for compatibility with an MSIE
+misfeature.
+
+ $string = "foo&nbspbar";
+ _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
+ print $string; # will print "foo bar"
+
+This routine is exported by default.
+
+=item encode_entities( $string )
+
+=item encode_entities( $string, $unsafe_chars )
+
+This routine replaces unsafe characters in $string with their entity
+representation. A second argument can be given to specify which characters to
+consider unsafe. The unsafe characters is specified using the regular
+expression character class syntax (what you find within brackets in regular
+expressions).
+
+The default set of characters to encode are control chars, high-bit chars, and
+the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters. But this,
+for example, would encode I<just> the C<< < >>, C<< & >>, C<< > >>, and C<< "
+>> characters:
+
+ $encoded = encode_entities($input, '<>&"');
+
+and this would only encode non-plain ascii:
+
+ $encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e');
+
+This routine is exported by default.
+
+=item encode_entities_numeric( $string )
+
+=item encode_entities_numeric( $string, $unsafe_chars )
+
+This routine works just like encode_entities, except that the replacement
+entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For
+example, C<encode_entities("r\xF4le")> returns "r&ocirc;le", but
+C<encode_entities_numeric("r\xF4le")> returns "r&#xF4;le".
+
+This routine is I<not> exported by default. But you can always
+export it with C<use HTML::Entities qw(encode_entities_numeric);>
+or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
+
+=back
+
+All these routines modify the string passed as the first argument, if
+called in a void context. In scalar and array contexts, the encoded or
+decoded string is returned (without changing the input string).
+
+If you prefer not to import these routines into your namespace, you can
+call them as:
+
+ use HTML::Entities ();
+ $decoded = HTML::Entities::decode($a);
+ $encoded = HTML::Entities::encode($a);
+ $encoded = HTML::Entities::encode_numeric($a);
+
+The module can also export the %char2entity and the %entity2char
+hashes, which contain the mapping from all characters to the
+corresponding entities (and vice versa, respectively).
+
+=head1 COPYRIGHT
+
+Copyright 1995-2006 Gisle Aas. All rights reserved.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
+
+use strict;
+use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
+use vars qw(%entity2char %char2entity);
+
+require 5.004;
+require Exporter;
+@ISA = qw(Exporter);
+
+@EXPORT = qw(encode_entities decode_entities _decode_entities);
+@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
+
+$VERSION = "3.69";
+sub Version { $VERSION; }
+
+require HTML::Parser; # for fast XS implemented decode_entities
+
+
+%entity2char = (
+ # Some normal chars that have special meaning in SGML context
+ amp => '&', # ampersand
+'gt' => '>', # greater than
+'lt' => '<', # less than
+ quot => '"', # double quote
+ apos => "'", # single quote
+
+ # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
+ AElig => chr(198), # capital AE diphthong (ligature)
+ Aacute => chr(193), # capital A, acute accent
+ Acirc => chr(194), # capital A, circumflex accent
+ Agrave => chr(192), # capital A, grave accent
+ Aring => chr(197), # capital A, ring
+ Atilde => chr(195), # capital A, tilde
+ Auml => chr(196), # capital A, dieresis or umlaut mark
+ Ccedil => chr(199), # capital C, cedilla
+ ETH => chr(208), # capital Eth, Icelandic
+ Eacute => chr(201), # capital E, acute accent
+ Ecirc => chr(202), # capital E, circumflex accent
+ Egrave => chr(200), # capital E, grave accent
+ Euml => chr(203), # capital E, dieresis or umlaut mark
+ Iacute => chr(205), # capital I, acute accent
+ Icirc => chr(206), # capital I, circumflex accent
+ Igrave => chr(204), # capital I, grave accent
+ Iuml => chr(207), # capital I, dieresis or umlaut mark
+ Ntilde => chr(209), # capital N, tilde
+ Oacute => chr(211), # capital O, acute accent
+ Ocirc => chr(212), # capital O, circumflex accent
+ Ograve => chr(210), # capital O, grave accent
+ Oslash => chr(216), # capital O, slash
+ Otilde => chr(213), # capital O, tilde
+ Ouml => chr(214), # capital O, dieresis or umlaut mark
+ THORN => chr(222), # capital THORN, Icelandic
+ Uacute => chr(218), # capital U, acute accent
+ Ucirc => chr(219), # capital U, circumflex accent
+ Ugrave => chr(217), # capital U, grave accent
+ Uuml => chr(220), # capital U, dieresis or umlaut mark
+ Yacute => chr(221), # capital Y, acute accent
+ aacute => chr(225), # small a, acute accent
+ acirc => chr(226), # small a, circumflex accent
+ aelig => chr(230), # small ae diphthong (ligature)
+ agrave => chr(224), # small a, grave accent
+ aring => chr(229), # small a, ring
+ atilde => chr(227), # small a, tilde
+ auml => chr(228), # small a, dieresis or umlaut mark
+ ccedil => chr(231), # small c, cedilla
+ eacute => chr(233), # small e, acute accent
+ ecirc => chr(234), # small e, circumflex accent
+ egrave => chr(232), # small e, grave accent
+ eth => chr(240), # small eth, Icelandic
+ euml => chr(235), # small e, dieresis or umlaut mark
+ iacute => chr(237), # small i, acute accent
+ icirc => chr(238), # small i, circumflex accent
+ igrave => chr(236), # small i, grave accent
+ iuml => chr(239), # small i, dieresis or umlaut mark
+ ntilde => chr(241), # small n, tilde
+ oacute => chr(243), # small o, acute accent
+ ocirc => chr(244), # small o, circumflex accent
+ ograve => chr(242), # small o, grave accent
+ oslash => chr(248), # small o, slash
+ otilde => chr(245), # small o, tilde
+ ouml => chr(246), # small o, dieresis or umlaut mark
+ szlig => chr(223), # small sharp s, German (sz ligature)
+ thorn => chr(254), # small thorn, Icelandic
+ uacute => chr(250), # small u, acute accent
+ ucirc => chr(251), # small u, circumflex accent
+ ugrave => chr(249), # small u, grave accent
+ uuml => chr(252), # small u, dieresis or umlaut mark
+ yacute => chr(253), # small y, acute accent
+ yuml => chr(255), # small y, dieresis or umlaut mark
+
+ # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
+ copy => chr(169), # copyright sign
+ reg => chr(174), # registered sign
+ nbsp => chr(160), # non breaking space
+
+ # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
+ iexcl => chr(161),
+ cent => chr(162),
+ pound => chr(163),
+ curren => chr(164),
+ yen => chr(165),
+ brvbar => chr(166),
+ sect => chr(167),
+ uml => chr(168),
+ ordf => chr(170),
+ laquo => chr(171),
+'not' => chr(172), # not is a keyword in perl
+ shy => chr(173),
+ macr => chr(175),
+ deg => chr(176),
+ plusmn => chr(177),
+ sup1 => chr(185),
+ sup2 => chr(178),
+ sup3 => chr(179),
+ acute => chr(180),
+ micro => chr(181),
+ para => chr(182),
+ middot => chr(183),
+ cedil => chr(184),
+ ordm => chr(186),
+ raquo => chr(187),
+ frac14 => chr(188),
+ frac12 => chr(189),
+ frac34 => chr(190),
+ iquest => chr(191),
+'times' => chr(215), # times is a keyword in perl
+ divide => chr(247),
+
+ ( $] > 5.007 ? (
+ 'OElig;' => chr(338),
+ 'oelig;' => chr(339),
+ 'Scaron;' => chr(352),
+ 'scaron;' => chr(353),
+ 'Yuml;' => chr(376),
+ 'fnof;' => chr(402),
+ 'circ;' => chr(710),
+ 'tilde;' => chr(732),
+ 'Alpha;' => chr(913),
+ 'Beta;' => chr(914),
+ 'Gamma;' => chr(915),
+ 'Delta;' => chr(916),
+ 'Epsilon;' => chr(917),
+ 'Zeta;' => chr(918),
+ 'Eta;' => chr(919),
+ 'Theta;' => chr(920),
+ 'Iota;' => chr(921),
+ 'Kappa;' => chr(922),
+ 'Lambda;' => chr(923),
+ 'Mu;' => chr(924),
+ 'Nu;' => chr(925),
+ 'Xi;' => chr(926),
+ 'Omicron;' => chr(927),
+ 'Pi;' => chr(928),
+ 'Rho;' => chr(929),
+ 'Sigma;' => chr(931),
+ 'Tau;' => chr(932),
+ 'Upsilon;' => chr(933),
+ 'Phi;' => chr(934),
+ 'Chi;' => chr(935),
+ 'Psi;' => chr(936),
+ 'Omega;' => chr(937),
+ 'alpha;' => chr(945),
+ 'beta;' => chr(946),
+ 'gamma;' => chr(947),
+ 'delta;' => chr(948),
+ 'epsilon;' => chr(949),
+ 'zeta;' => chr(950),
+ 'eta;' => chr(951),
+ 'theta;' => chr(952),
+ 'iota;' => chr(953),
+ 'kappa;' => chr(954),
+ 'lambda;' => chr(955),
+ 'mu;' => chr(956),
+ 'nu;' => chr(957),
+ 'xi;' => chr(958),
+ 'omicron;' => chr(959),
+ 'pi;' => chr(960),
+ 'rho;' => chr(961),
+ 'sigmaf;' => chr(962),
+ 'sigma;' => chr(963),
+ 'tau;' => chr(964),
+ 'upsilon;' => chr(965),
+ 'phi;' => chr(966),
+ 'chi;' => chr(967),
+ 'psi;' => chr(968),
+ 'omega;' => chr(969),
+ 'thetasym;' => chr(977),
+ 'upsih;' => chr(978),
+ 'piv;' => chr(982),
+ 'ensp;' => chr(8194),
+ 'emsp;' => chr(8195),
+ 'thinsp;' => chr(8201),
+ 'zwnj;' => chr(8204),
+ 'zwj;' => chr(8205),
+ 'lrm;' => chr(8206),
+ 'rlm;' => chr(8207),
+ 'ndash;' => chr(8211),
+ 'mdash;' => chr(8212),
+ 'lsquo;' => chr(8216),
+ 'rsquo;' => chr(8217),
+ 'sbquo;' => chr(8218),
+ 'ldquo;' => chr(8220),
+ 'rdquo;' => chr(8221),
+ 'bdquo;' => chr(8222),
+ 'dagger;' => chr(8224),
+ 'Dagger;' => chr(8225),
+ 'bull;' => chr(8226),
+ 'hellip;' => chr(8230),
+ 'permil;' => chr(8240),
+ 'prime;' => chr(8242),
+ 'Prime;' => chr(8243),
+ 'lsaquo;' => chr(8249),
+ 'rsaquo;' => chr(8250),
+ 'oline;' => chr(8254),
+ 'frasl;' => chr(8260),
+ 'euro;' => chr(8364),
+ 'image;' => chr(8465),
+ 'weierp;' => chr(8472),
+ 'real;' => chr(8476),
+ 'trade;' => chr(8482),
+ 'alefsym;' => chr(8501),
+ 'larr;' => chr(8592),
+ 'uarr;' => chr(8593),
+ 'rarr;' => chr(8594),
+ 'darr;' => chr(8595),
+ 'harr;' => chr(8596),
+ 'crarr;' => chr(8629),
+ 'lArr;' => chr(8656),
+ 'uArr;' => chr(8657),
+ 'rArr;' => chr(8658),
+ 'dArr;' => chr(8659),
+ 'hArr;' => chr(8660),
+ 'forall;' => chr(8704),
+ 'part;' => chr(8706),
+ 'exist;' => chr(8707),
+ 'empty;' => chr(8709),
+ 'nabla;' => chr(8711),
+ 'isin;' => chr(8712),
+ 'notin;' => chr(8713),
+ 'ni;' => chr(8715),
+ 'prod;' => chr(8719),
+ 'sum;' => chr(8721),
+ 'minus;' => chr(8722),
+ 'lowast;' => chr(8727),
+ 'radic;' => chr(8730),
+ 'prop;' => chr(8733),
+ 'infin;' => chr(8734),
+ 'ang;' => chr(8736),
+ 'and;' => chr(8743),
+ 'or;' => chr(8744),
+ 'cap;' => chr(8745),
+ 'cup;' => chr(8746),
+ 'int;' => chr(8747),
+ 'there4;' => chr(8756),
+ 'sim;' => chr(8764),
+ 'cong;' => chr(8773),
+ 'asymp;' => chr(8776),
+ 'ne;' => chr(8800),
+ 'equiv;' => chr(8801),
+ 'le;' => chr(8804),
+ 'ge;' => chr(8805),
+ 'sub;' => chr(8834),
+ 'sup;' => chr(8835),
+ 'nsub;' => chr(8836),
+ 'sube;' => chr(8838),
+ 'supe;' => chr(8839),
+ 'oplus;' => chr(8853),
+ 'otimes;' => chr(8855),
+ 'perp;' => chr(8869),
+ 'sdot;' => chr(8901),
+ 'lceil;' => chr(8968),
+ 'rceil;' => chr(8969),
+ 'lfloor;' => chr(8970),
+ 'rfloor;' => chr(8971),
+ 'lang;' => chr(9001),
+ 'rang;' => chr(9002),
+ 'loz;' => chr(9674),
+ 'spades;' => chr(9824),
+ 'clubs;' => chr(9827),
+ 'hearts;' => chr(9829),
+ 'diams;' => chr(9830),
+ ) : ())
+);
+
+
+# Make the opposite mapping
+while (my($entity, $char) = each(%entity2char)) {
+ $entity =~ s/;\z//;
+ $char2entity{$char} = "&$entity;";
+}
+delete $char2entity{"'"}; # only one-way decoding
+
+# Fill in missing entities
+for (0 .. 255) {
+ next if exists $char2entity{chr($_)};
+ $char2entity{chr($_)} = "&#$_;";
+}
+
+my %subst; # compiled encoding regexps
+
+sub encode_entities
+{
+ return undef unless defined $_[0];
+ my $ref;
+ if (defined wantarray) {
+ my $x = $_[0];
+ $ref = \$x; # copy
+ } else {
+ $ref = \$_[0]; # modify in-place
+ }
+ if (defined $_[1] and length $_[1]) {
+ unless (exists $subst{$_[1]}) {
+ # Because we can't compile regex we fake it with a cached sub
+ my $chars = $_[1];
+ $chars =~ s,(?<!\\)([]/]),\\$1,g;
+ $chars =~ s,(?<!\\)\\\z,\\\\,;
+ my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
+ $subst{$_[1]} = eval $code;
+ die( $@ . " while trying to turn range: \"$_[1]\"\n "
+ . "into code: $code\n "
+ ) if $@;
+ }
+ &{$subst{$_[1]}}($$ref);
+ } else {
+ # Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
+ $$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
+ }
+ $$ref;
+}
+
+sub encode_entities_numeric {
+ local %char2entity;
+ return &encode_entities; # a goto &encode_entities wouldn't work
+}
+
+
+sub num_entity {
+ sprintf "&#x%X;", ord($_[0]);
+}
+
+# Set up aliases
+*encode = \&encode_entities;
+*encode_numeric = \&encode_entities_numeric;
+*encode_numerically = \&encode_entities_numeric;
+*decode = \&decode_entities;
+
+1;