diff options
Diffstat (limited to 'util.c')
-rw-r--r-- | util.c | 311 |
1 files changed, 311 insertions, 0 deletions
@@ -0,0 +1,311 @@ +/* + * Copyright 1999-2009, Gisle Aas. + * + * This library is free software; you can redistribute it and/or + * modify it under the same terms as Perl itself. + */ + +#ifndef EXTERN +#define EXTERN extern +#endif + + +EXTERN SV* +sv_lower(pTHX_ SV* sv) +{ + STRLEN len; + char *s = SvPV_force(sv, len); + for (; len--; s++) + *s = toLOWER(*s); + return sv; +} + +EXTERN int +strnEQx(const char* s1, const char* s2, STRLEN n, int ignore_case) +{ + while (n--) { + if (ignore_case) { + if (toLOWER(*s1) != toLOWER(*s2)) + return 0; + } + else { + if (*s1 != *s2) + return 0; + } + s1++; + s2++; + } + return 1; +} + +static void +grow_gap(pTHX_ SV* sv, STRLEN grow, char** t, char** s, char** e) +{ + /* + SvPVX ---> AAAAAA...BBBBBB + ^ ^ ^ + t s e + */ + STRLEN t_offset = *t - SvPVX(sv); + STRLEN s_offset = *s - SvPVX(sv); + STRLEN e_offset = *e - SvPVX(sv); + + SvGROW(sv, e_offset + grow + 1); + + *t = SvPVX(sv) + t_offset; + *s = SvPVX(sv) + s_offset; + *e = SvPVX(sv) + e_offset; + + Move(*s, *s+grow, *e - *s, char); + *s += grow; + *e += grow; +} + +EXTERN SV* +decode_entities(pTHX_ SV* sv, HV* entity2char, bool expand_prefix) +{ + STRLEN len; + char *s = SvPV_force(sv, len); + char *t = s; + char *end = s + len; + char *ent_start; + + char *repl; + STRLEN repl_len; +#ifdef UNICODE_HTML_PARSER + char buf[UTF8_MAXLEN]; + int repl_utf8; + int high_surrogate = 0; +#else + char buf[1]; +#endif + +#if defined(__GNUC__) && defined(UNICODE_HTML_PARSER) + /* gcc -Wall reports this variable as possibly used uninitialized */ + repl_utf8 = 0; +#endif + + while (s < end) { + assert(t <= s); + + if ((*t++ = *s++) != '&') + continue; + + ent_start = s; + repl = 0; + + if (s < end && *s == '#') { + UV num = 0; + int ok = 0; + s++; + if (s < end && (*s == 'x' || *s == 'X')) { + s++; + while (s < end) { + char *tmp = strchr(PL_hexdigit, *s); + if (!tmp) + break; + num = num << 4 | ((tmp - PL_hexdigit) & 15); + if (num > 0x10FFFF) { + /* overflow */ + ok = 0; + break; + } + s++; + ok = 1; + } + } + else { + while (s < end && isDIGIT(*s)) { + num = num * 10 + (*s - '0'); + if (num > 0x10FFFF) { + /* overflow */ + ok = 0; + break; + } + s++; + ok = 1; + } + } + if (num && ok) { +#ifdef UNICODE_HTML_PARSER + if (!SvUTF8(sv) && num <= 255) { + buf[0] = (char) num; + repl = buf; + repl_len = 1; + repl_utf8 = 0; + } + else if (num == 0xFFFE || num == 0xFFFF) { + /* illegal */ + } + else { + char *tmp; + if ((num & 0xFFFFFC00) == 0xDC00) { /* low-surrogate */ + if (high_surrogate != 0) { + t -= 3; /* Back up past 0xFFFD */ + num = ((high_surrogate - 0xD800) << 10) + + (num - 0xDC00) + 0x10000; + high_surrogate = 0; + } else { + num = 0xFFFD; + } + } + else if ((num & 0xFFFFFC00) == 0xD800) { /* high-surrogate */ + high_surrogate = num; + num = 0xFFFD; + } + else { + high_surrogate = 0; + /* otherwise invalid? */ + if ((num >= 0xFDD0 && num <= 0xFDEF) || + ((num & 0xFFFE) == 0xFFFE) || + num > 0x10FFFF) + { + num = 0xFFFD; + } + } + + tmp = (char*)uvuni_to_utf8((U8*)buf, num); + repl = buf; + repl_len = tmp - buf; + repl_utf8 = 1; + } +#else + if (num <= 255) { + buf[0] = (char) num & 0xFF; + repl = buf; + repl_len = 1; + } +#endif + } + } + else { + char *ent_name = s; + while (s < end && isALNUM(*s)) + s++; + if (ent_name != s && entity2char) { + SV** svp; + if ( (svp = hv_fetch(entity2char, ent_name, s - ent_name, 0)) || + (*s == ';' && (svp = hv_fetch(entity2char, ent_name, s - ent_name + 1, 0))) + ) + { + repl = SvPV(*svp, repl_len); +#ifdef UNICODE_HTML_PARSER + repl_utf8 = SvUTF8(*svp); +#endif + } + else if (expand_prefix) { + char *ss = s - 1; + while (ss > ent_name) { + svp = hv_fetch(entity2char, ent_name, ss - ent_name, 0); + if (svp) { + repl = SvPV(*svp, repl_len); +#ifdef UNICODE_HTML_PARSER + repl_utf8 = SvUTF8(*svp); +#endif + s = ss; + break; + } + ss--; + } + } + } +#ifdef UNICODE_HTML_PARSER + high_surrogate = 0; +#endif + } + + if (repl) { + char *repl_allocated = 0; + if (s < end && *s == ';') + s++; + t--; /* '&' already copied, undo it */ + +#ifdef UNICODE_HTML_PARSER + if (*s != '&') { + high_surrogate = 0; + } + + if (!SvUTF8(sv) && repl_utf8) { + /* need to upgrade sv before we continue */ + STRLEN before_gap_len = t - SvPVX(sv); + char *before_gap = (char*)bytes_to_utf8((U8*)SvPVX(sv), &before_gap_len); + STRLEN after_gap_len = end - s; + char *after_gap = (char*)bytes_to_utf8((U8*)s, &after_gap_len); + + sv_setpvn(sv, before_gap, before_gap_len); + sv_catpvn(sv, after_gap, after_gap_len); + SvUTF8_on(sv); + + Safefree(before_gap); + Safefree(after_gap); + + s = t = SvPVX(sv) + before_gap_len; + end = SvPVX(sv) + before_gap_len + after_gap_len; + } + else if (SvUTF8(sv) && !repl_utf8) { + repl = (char*)bytes_to_utf8((U8*)repl, &repl_len); + repl_allocated = repl; + } +#endif + + if (t + repl_len > s) { + /* need to grow the string */ + grow_gap(aTHX_ sv, repl_len - (s - t), &t, &s, &end); + } + + /* copy replacement string into string */ + while (repl_len--) + *t++ = *repl++; + + if (repl_allocated) + Safefree(repl_allocated); + } + else { + while (ent_start < s) + *t++ = *ent_start++; + } + } + + *t = '\0'; + SvCUR_set(sv, t - SvPVX(sv)); + + return sv; +} + +#ifdef UNICODE_HTML_PARSER +static bool +has_hibit(char *s, char *e) +{ + while (s < e) { + U8 ch = *s++; + if (!UTF8_IS_INVARIANT(ch)) { + return 1; + } + } + return 0; +} + + +EXTERN bool +probably_utf8_chunk(pTHX_ char *s, STRLEN len) +{ + char *e = s + len; + STRLEN clen; + + /* ignore partial utf8 char at end of buffer */ + while (s < e && UTF8_IS_CONTINUATION((U8)*(e - 1))) + e--; + if (s < e && UTF8_IS_START((U8)*(e - 1))) + e--; + clen = len - (e - s); + if (clen && UTF8SKIP(e) == clen) { + /* all promised continuation bytes are present */ + e = s + len; + } + + if (!has_hibit(s, e)) + return 0; + + return is_utf8_string((U8*)s, e - s); +} +#endif |