/* * Copyright 1999-2009, Gisle Aas. * * This library is free software; you can redistribute it and/or * modify it under the same terms as Perl itself. */ #ifndef EXTERN #define EXTERN extern #endif EXTERN SV* sv_lower(pTHX_ SV* sv) { STRLEN len; char *s = SvPV_force(sv, len); for (; len--; s++) *s = toLOWER(*s); return sv; } EXTERN int strnEQx(const char* s1, const char* s2, STRLEN n, int ignore_case) { while (n--) { if (ignore_case) { if (toLOWER(*s1) != toLOWER(*s2)) return 0; } else { if (*s1 != *s2) return 0; } s1++; s2++; } return 1; } static void grow_gap(pTHX_ SV* sv, STRLEN grow, char** t, char** s, char** e) { /* SvPVX ---> AAAAAA...BBBBBB ^ ^ ^ t s e */ STRLEN t_offset = *t - SvPVX(sv); STRLEN s_offset = *s - SvPVX(sv); STRLEN e_offset = *e - SvPVX(sv); SvGROW(sv, e_offset + grow + 1); *t = SvPVX(sv) + t_offset; *s = SvPVX(sv) + s_offset; *e = SvPVX(sv) + e_offset; Move(*s, *s+grow, *e - *s, char); *s += grow; *e += grow; } EXTERN SV* decode_entities(pTHX_ SV* sv, HV* entity2char, bool expand_prefix) { STRLEN len; char *s = SvPV_force(sv, len); char *t = s; char *end = s + len; char *ent_start; char *repl; STRLEN repl_len; #ifdef UNICODE_HTML_PARSER char buf[UTF8_MAXLEN]; int repl_utf8; int high_surrogate = 0; #else char buf[1]; #endif #if defined(__GNUC__) && defined(UNICODE_HTML_PARSER) /* gcc -Wall reports this variable as possibly used uninitialized */ repl_utf8 = 0; #endif while (s < end) { assert(t <= s); if ((*t++ = *s++) != '&') continue; ent_start = s; repl = 0; if (s < end && *s == '#') { UV num = 0; int ok = 0; s++; if (s < end && (*s == 'x' || *s == 'X')) { s++; while (s < end) { char *tmp = strchr(PL_hexdigit, *s); if (!tmp) break; num = num << 4 | ((tmp - PL_hexdigit) & 15); if (num > 0x10FFFF) { /* overflow */ ok = 0; break; } s++; ok = 1; } } else { while (s < end && isDIGIT(*s)) { num = num * 10 + (*s - '0'); if (num > 0x10FFFF) { /* overflow */ ok = 0; break; } s++; ok = 1; } } if (num && ok) { #ifdef UNICODE_HTML_PARSER if (!SvUTF8(sv) && num <= 255) { buf[0] = (char) num; repl = buf; repl_len = 1; repl_utf8 = 0; } else if (num == 0xFFFE || num == 0xFFFF) { /* illegal */ } else { char *tmp; if ((num & 0xFFFFFC00) == 0xDC00) { /* low-surrogate */ if (high_surrogate != 0) { t -= 3; /* Back up past 0xFFFD */ num = ((high_surrogate - 0xD800) << 10) + (num - 0xDC00) + 0x10000; high_surrogate = 0; } else { num = 0xFFFD; } } else if ((num & 0xFFFFFC00) == 0xD800) { /* high-surrogate */ high_surrogate = num; num = 0xFFFD; } else { high_surrogate = 0; /* otherwise invalid? */ if ((num >= 0xFDD0 && num <= 0xFDEF) || ((num & 0xFFFE) == 0xFFFE) || num > 0x10FFFF) { num = 0xFFFD; } } tmp = (char*)uvuni_to_utf8((U8*)buf, num); repl = buf; repl_len = tmp - buf; repl_utf8 = 1; } #else if (num <= 255) { buf[0] = (char) num & 0xFF; repl = buf; repl_len = 1; } #endif } } else { char *ent_name = s; while (s < end && isALNUM(*s)) s++; if (ent_name != s && entity2char) { SV** svp; if ( (svp = hv_fetch(entity2char, ent_name, s - ent_name, 0)) || (*s == ';' && (svp = hv_fetch(entity2char, ent_name, s - ent_name + 1, 0))) ) { repl = SvPV(*svp, repl_len); #ifdef UNICODE_HTML_PARSER repl_utf8 = SvUTF8(*svp); #endif } else if (expand_prefix) { char *ss = s - 1; while (ss > ent_name) { svp = hv_fetch(entity2char, ent_name, ss - ent_name, 0); if (svp) { repl = SvPV(*svp, repl_len); #ifdef UNICODE_HTML_PARSER repl_utf8 = SvUTF8(*svp); #endif s = ss; break; } ss--; } } } #ifdef UNICODE_HTML_PARSER high_surrogate = 0; #endif } if (repl) { char *repl_allocated = 0; if (s < end && *s == ';') s++; t--; /* '&' already copied, undo it */ #ifdef UNICODE_HTML_PARSER if (*s != '&') { high_surrogate = 0; } if (!SvUTF8(sv) && repl_utf8) { /* need to upgrade sv before we continue */ STRLEN before_gap_len = t - SvPVX(sv); char *before_gap = (char*)bytes_to_utf8((U8*)SvPVX(sv), &before_gap_len); STRLEN after_gap_len = end - s; char *after_gap = (char*)bytes_to_utf8((U8*)s, &after_gap_len); sv_setpvn(sv, before_gap, before_gap_len); sv_catpvn(sv, after_gap, after_gap_len); SvUTF8_on(sv); Safefree(before_gap); Safefree(after_gap); s = t = SvPVX(sv) + before_gap_len; end = SvPVX(sv) + before_gap_len + after_gap_len; } else if (SvUTF8(sv) && !repl_utf8) { repl = (char*)bytes_to_utf8((U8*)repl, &repl_len); repl_allocated = repl; } #endif if (t + repl_len > s) { /* need to grow the string */ grow_gap(aTHX_ sv, repl_len - (s - t), &t, &s, &end); } /* copy replacement string into string */ while (repl_len--) *t++ = *repl++; if (repl_allocated) Safefree(repl_allocated); } else { while (ent_start < s) *t++ = *ent_start++; } } *t = '\0'; SvCUR_set(sv, t - SvPVX(sv)); return sv; } #ifdef UNICODE_HTML_PARSER static bool has_hibit(char *s, char *e) { while (s < e) { U8 ch = *s++; if (!UTF8_IS_INVARIANT(ch)) { return 1; } } return 0; } EXTERN bool probably_utf8_chunk(pTHX_ char *s, STRLEN len) { char *e = s + len; STRLEN clen; /* ignore partial utf8 char at end of buffer */ while (s < e && UTF8_IS_CONTINUATION((U8)*(e - 1))) e--; if (s < e && UTF8_IS_START((U8)*(e - 1))) e--; clen = len - (e - s); if (clen && UTF8SKIP(e) == clen) { /* all promised continuation bytes are present */ e = s + len; } if (!has_hibit(s, e)) return 0; return is_utf8_string((U8*)s, e - s); } #endif