/* * Copyright 1999-2009, Gisle Aas. * Copyright 1999-2000, Michael A. Chase. * * This library is free software; you can redistribute it and/or * modify it under the same terms as Perl itself. */ /* * Standard XS greeting. */ #ifdef __cplusplus extern "C" { #endif #define PERL_NO_GET_CONTEXT /* we want efficiency */ #include "EXTERN.h" #include "perl.h" #include "XSUB.h" #ifdef __cplusplus } #endif /* * Some perl version compatibility gruff. */ #include "patchlevel.h" #if PATCHLEVEL <= 4 /* perl5.004_XX */ #ifndef PL_sv_undef #define PL_sv_undef sv_undef #define PL_sv_yes sv_yes #endif #ifndef PL_hexdigit #define PL_hexdigit hexdigit #endif #ifndef ERRSV #define ERRSV GvSV(errgv) #endif #if (PATCHLEVEL == 4 && SUBVERSION <= 4) /* The newSVpvn function was introduced in perl5.004_05 */ static SV * newSVpvn(char *s, STRLEN len) { register SV *sv = newSV(0); sv_setpvn(sv,s,len); return sv; } #endif /* not perl5.004_05 */ #endif /* perl5.004_XX */ #ifndef dNOOP #define dNOOP extern int errno #endif #ifndef dTHX #define dTHX dNOOP #define pTHX_ #define aTHX_ #endif #ifndef MEMBER_TO_FPTR #define MEMBER_TO_FPTR(x) (x) #endif #ifndef INT2PTR #define INT2PTR(any,d) (any)(d) #define PTR2IV(p) (IV)(p) #endif #if PATCHLEVEL > 6 || (PATCHLEVEL == 6 && SUBVERSION > 0) #define RETHROW croak(Nullch) #else #define RETHROW { STRLEN my_na; croak("%s", SvPV(ERRSV, my_na)); } #endif #if PATCHLEVEL < 8 /* No useable Unicode support */ /* Make these harmless if present */ #undef SvUTF8 #undef SvUTF8_on #undef SvUTF8_off #define SvUTF8(sv) 0 #define SvUTF8_on(sv) 0 #define SvUTF8_off(sv) 0 #else #define UNICODE_HTML_PARSER #endif #ifdef G_WARN_ON #define DOWARN (PL_dowarn & G_WARN_ON) #else #define DOWARN PL_dowarn #endif #ifndef CLONEf_JOIN_IN #define CLONEf_JOIN_IN 0 #endif /* * Include stuff. We include .c files instead of linking them, * so that they don't have to pollute the external dll name space. */ #ifdef EXTERN #undef EXTERN #endif #define EXTERN static /* Don't pollute */ #include "hparser.h" #include "util.c" #include "hparser.c" /* * Support functions for the XS glue */ static SV* check_handler(pTHX_ SV* h) { SvGETMAGIC(h); if (SvROK(h)) { SV* myref = SvRV(h); if (SvTYPE(myref) == SVt_PVCV) return newSVsv(h); if (SvTYPE(myref) == SVt_PVAV) return SvREFCNT_inc(myref); croak("Only code or array references allowed as handler"); } return SvOK(h) ? newSVsv(h) : 0; } static PSTATE* get_pstate_iv(pTHX_ SV* sv) { PSTATE *p; #if PATCHLEVEL < 8 p = INT2PTR(PSTATE*, SvIV(sv)); #else MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, '~') : NULL; if (!mg) croak("Lost parser state magic"); p = (PSTATE *)mg->mg_ptr; if (!p) croak("Lost parser state magic"); #endif if (p->signature != P_SIGNATURE) croak("Bad signature in parser state object at %p", p); return p; } static PSTATE* get_pstate_hv(pTHX_ SV* sv) /* used by XS typemap */ { HV* hv; SV** svp; sv = SvRV(sv); if (!sv || SvTYPE(sv) != SVt_PVHV) croak("Not a reference to a hash"); hv = (HV*)sv; svp = hv_fetch(hv, "_hparser_xs_state", 17, 0); if (svp) { if (SvROK(*svp)) return get_pstate_iv(aTHX_ SvRV(*svp)); else croak("_hparser_xs_state element is not a reference"); } croak("Can't find '_hparser_xs_state' element in HTML::Parser hash"); return 0; } static void free_pstate(pTHX_ PSTATE* pstate) { int i; SvREFCNT_dec(pstate->buf); SvREFCNT_dec(pstate->pend_text); SvREFCNT_dec(pstate->skipped_text); #ifdef MARKED_SECTION SvREFCNT_dec(pstate->ms_stack); #endif SvREFCNT_dec(pstate->bool_attr_val); for (i = 0; i < EVENT_COUNT; i++) { SvREFCNT_dec(pstate->handlers[i].cb); SvREFCNT_dec(pstate->handlers[i].argspec); } SvREFCNT_dec(pstate->report_tags); SvREFCNT_dec(pstate->ignore_tags); SvREFCNT_dec(pstate->ignore_elements); SvREFCNT_dec(pstate->ignoring_element); SvREFCNT_dec(pstate->tmp); pstate->signature = 0; Safefree(pstate); } static int magic_free_pstate(pTHX_ SV *sv, MAGIC *mg) { #if PATCHLEVEL < 8 free_pstate(aTHX_ get_pstate_iv(aTHX_ sv)); #else free_pstate(aTHX_ (PSTATE *)mg->mg_ptr); #endif return 0; } #if defined(USE_ITHREADS) && PATCHLEVEL >= 8 static PSTATE * dup_pstate(pTHX_ PSTATE *pstate, CLONE_PARAMS *params) { PSTATE *pstate2; int i; Newz(56, pstate2, 1, PSTATE); pstate2->signature = pstate->signature; pstate2->buf = SvREFCNT_inc(sv_dup(pstate->buf, params)); pstate2->offset = pstate->offset; pstate2->line = pstate->line; pstate2->column = pstate->column; pstate2->start_document = pstate->start_document; pstate2->parsing = pstate->parsing; pstate2->eof = pstate->eof; pstate2->literal_mode = pstate->literal_mode; pstate2->is_cdata = pstate->is_cdata; pstate2->no_dash_dash_comment_end = pstate->no_dash_dash_comment_end; pstate2->pending_end_tag = pstate->pending_end_tag; pstate2->pend_text = SvREFCNT_inc(sv_dup(pstate->pend_text, params)); pstate2->pend_text_is_cdata = pstate->pend_text_is_cdata; pstate2->pend_text_offset = pstate->pend_text_offset; pstate2->pend_text_line = pstate->pend_text_offset; pstate2->pend_text_column = pstate->pend_text_column; pstate2->skipped_text = SvREFCNT_inc(sv_dup(pstate->skipped_text, params)); #ifdef MARKED_SECTION pstate2->ms = pstate->ms; pstate2->ms_stack = (AV *)SvREFCNT_inc(sv_dup((SV *)pstate->ms_stack, params)); pstate2->marked_sections = pstate->marked_sections; #endif pstate2->strict_comment = pstate->strict_comment; pstate2->strict_names = pstate->strict_names; pstate2->strict_end = pstate->strict_end; pstate2->xml_mode = pstate->xml_mode; pstate2->unbroken_text = pstate->unbroken_text; pstate2->attr_encoded = pstate->attr_encoded; pstate2->case_sensitive = pstate->case_sensitive; pstate2->closing_plaintext = pstate->closing_plaintext; pstate2->utf8_mode = pstate->utf8_mode; pstate2->empty_element_tags = pstate->empty_element_tags; pstate2->xml_pic = pstate->xml_pic; pstate2->backquote = pstate->backquote; pstate2->bool_attr_val = SvREFCNT_inc(sv_dup(pstate->bool_attr_val, params)); for (i = 0; i < EVENT_COUNT; i++) { pstate2->handlers[i].cb = SvREFCNT_inc(sv_dup(pstate->handlers[i].cb, params)); pstate2->handlers[i].argspec = SvREFCNT_inc(sv_dup(pstate->handlers[i].argspec, params)); } pstate2->argspec_entity_decode = pstate->argspec_entity_decode; pstate2->report_tags = (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->report_tags, params)); pstate2->ignore_tags = (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->ignore_tags, params)); pstate2->ignore_elements = (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->ignore_elements, params)); pstate2->ignoring_element = SvREFCNT_inc(sv_dup(pstate->ignoring_element, params)); pstate2->ignore_depth = pstate->ignore_depth; if (params->flags & CLONEf_JOIN_IN) { pstate2->entity2char = perl_get_hv("HTML::Entities::entity2char", TRUE); } else { pstate2->entity2char = (HV *)sv_dup((SV *)pstate->entity2char, params); } pstate2->tmp = SvREFCNT_inc(sv_dup(pstate->tmp, params)); return pstate2; } static int magic_dup_pstate(pTHX_ MAGIC *mg, CLONE_PARAMS *params) { mg->mg_ptr = (char *)dup_pstate(aTHX_ (PSTATE *)mg->mg_ptr, params); return 0; } #endif MGVTBL vtbl_pstate = { 0, 0, 0, 0, MEMBER_TO_FPTR(magic_free_pstate), #if defined(USE_ITHREADS) && PATCHLEVEL >= 8 0, MEMBER_TO_FPTR(magic_dup_pstate), #endif }; /* * XS interface definition. */ MODULE = HTML::Parser PACKAGE = HTML::Parser PROTOTYPES: DISABLE void _alloc_pstate(self) SV* self; PREINIT: PSTATE* pstate; SV* sv; HV* hv; MAGIC* mg; CODE: sv = SvRV(self); if (!sv || SvTYPE(sv) != SVt_PVHV) croak("Not a reference to a hash"); hv = (HV*)sv; Newz(56, pstate, 1, PSTATE); pstate->signature = P_SIGNATURE; pstate->entity2char = perl_get_hv("HTML::Entities::entity2char", TRUE); pstate->tmp = NEWSV(0, 20); sv = newSViv(PTR2IV(pstate)); #if PATCHLEVEL < 8 sv_magic(sv, 0, '~', 0, 0); #else sv_magic(sv, 0, '~', (char *)pstate, 0); #endif mg = mg_find(sv, '~'); assert(mg); mg->mg_virtual = &vtbl_pstate; #if defined(USE_ITHREADS) && PATCHLEVEL >= 8 mg->mg_flags |= MGf_DUP; #endif SvREADONLY_on(sv); hv_store(hv, "_hparser_xs_state", 17, newRV_noinc(sv), 0); void parse(self, chunk) SV* self; SV* chunk PREINIT: PSTATE* p_state = get_pstate_hv(aTHX_ self); PPCODE: if (p_state->parsing) croak("Parse loop not allowed"); p_state->parsing = 1; if (SvROK(chunk) && SvTYPE(SvRV(chunk)) == SVt_PVCV) { SV* generator = chunk; STRLEN len; do { int count; PUSHMARK(SP); count = perl_call_sv(generator, G_SCALAR|G_EVAL); SPAGAIN; chunk = count ? POPs : 0; PUTBACK; if (SvTRUE(ERRSV)) { p_state->parsing = 0; p_state->eof = 0; RETHROW; } if (chunk && SvOK(chunk)) { (void)SvPV(chunk, len); /* get length */ } else { len = 0; } parse(aTHX_ p_state, len ? chunk : 0, self); SPAGAIN; } while (len && !p_state->eof); } else { parse(aTHX_ p_state, chunk, self); SPAGAIN; } p_state->parsing = 0; if (p_state->eof) { p_state->eof = 0; PUSHs(sv_newmortal()); } else { PUSHs(self); } void eof(self) SV* self; PREINIT: PSTATE* p_state = get_pstate_hv(aTHX_ self); PPCODE: if (p_state->parsing) p_state->eof = 1; else { p_state->parsing = 1; parse(aTHX_ p_state, 0, self); /* flush */ p_state->parsing = 0; } PUSHs(self); SV* strict_comment(pstate,...) PSTATE* pstate ALIAS: HTML::Parser::strict_comment = 1 HTML::Parser::strict_names = 2 HTML::Parser::xml_mode = 3 HTML::Parser::unbroken_text = 4 HTML::Parser::marked_sections = 5 HTML::Parser::attr_encoded = 6 HTML::Parser::case_sensitive = 7 HTML::Parser::strict_end = 8 HTML::Parser::closing_plaintext = 9 HTML::Parser::utf8_mode = 10 HTML::Parser::empty_element_tags = 11 HTML::Parser::xml_pic = 12 HTML::Parser::backquote = 13 PREINIT: bool *attr; CODE: switch (ix) { case 1: attr = &pstate->strict_comment; break; case 2: attr = &pstate->strict_names; break; case 3: attr = &pstate->xml_mode; break; case 4: attr = &pstate->unbroken_text; break; case 5: #ifdef MARKED_SECTION attr = &pstate->marked_sections; break; #else croak("marked sections not supported"); break; #endif case 6: attr = &pstate->attr_encoded; break; case 7: attr = &pstate->case_sensitive; break; case 8: attr = &pstate->strict_end; break; case 9: attr = &pstate->closing_plaintext; break; #ifdef UNICODE_HTML_PARSER case 10: attr = &pstate->utf8_mode; break; #else case 10: croak("The utf8_mode does not work with this perl; perl-5.8 or better required"); #endif case 11: attr = &pstate->empty_element_tags; break; case 12: attr = &pstate->xml_pic; break; case 13: attr = &pstate->backquote; break; default: croak("Unknown boolean attribute (%d)", ix); } RETVAL = boolSV(*attr); if (items > 1) *attr = SvTRUE(ST(1)); OUTPUT: RETVAL SV* boolean_attribute_value(pstate,...) PSTATE* pstate CODE: RETVAL = pstate->bool_attr_val ? newSVsv(pstate->bool_attr_val) : &PL_sv_undef; if (items > 1) { SvREFCNT_dec(pstate->bool_attr_val); pstate->bool_attr_val = newSVsv(ST(1)); } OUTPUT: RETVAL void ignore_tags(pstate,...) PSTATE* pstate ALIAS: HTML::Parser::report_tags = 1 HTML::Parser::ignore_tags = 2 HTML::Parser::ignore_elements = 3 PREINIT: HV** attr; int i; CODE: switch (ix) { case 1: attr = &pstate->report_tags; break; case 2: attr = &pstate->ignore_tags; break; case 3: attr = &pstate->ignore_elements; break; default: croak("Unknown tag-list attribute (%d)", ix); } if (GIMME_V != G_VOID) croak("Can't report tag lists yet"); items--; /* pstate */ if (items) { if (*attr) hv_clear(*attr); else *attr = newHV(); for (i = 0; i < items; i++) { SV* sv = ST(i+1); if (SvROK(sv)) { sv = SvRV(sv); if (SvTYPE(sv) == SVt_PVAV) { AV* av = (AV*)sv; STRLEN j; STRLEN len = av_len(av) + 1; for (j = 0; j < len; j++) { SV**svp = av_fetch(av, j, 0); if (svp) { hv_store_ent(*attr, *svp, newSViv(0), 0); } } } else croak("Tag list must be plain scalars and arrays"); } else { hv_store_ent(*attr, sv, newSViv(0), 0); } } } else if (*attr) { SvREFCNT_dec(*attr); *attr = 0; } void handler(pstate, eventname,...) PSTATE* pstate SV* eventname PREINIT: STRLEN name_len; char *name = SvPV(eventname, name_len); int event = -1; int i; struct p_handler *h; PPCODE: /* map event name string to event_id */ for (i = 0; i < EVENT_COUNT; i++) { if (strEQ(name, event_id_str[i])) { event = i; break; } } if (event < 0) croak("No handler for %s events", name); h = &pstate->handlers[event]; /* set up return value */ if (h->cb) { PUSHs((SvTYPE(h->cb) == SVt_PVAV) ? sv_2mortal(newRV_inc(h->cb)) : sv_2mortal(newSVsv(h->cb))); } else { PUSHs(&PL_sv_undef); } /* update */ if (items > 3) { SvREFCNT_dec(h->argspec); h->argspec = 0; h->argspec = argspec_compile(ST(3), pstate); } if (items > 2) { SvREFCNT_dec(h->cb); h->cb = 0; h->cb = check_handler(aTHX_ ST(2)); } MODULE = HTML::Parser PACKAGE = HTML::Entities void decode_entities(...) PREINIT: int i; HV *entity2char = perl_get_hv("HTML::Entities::entity2char", FALSE); PPCODE: if (GIMME_V == G_SCALAR && items > 1) items = 1; for (i = 0; i < items; i++) { if (GIMME_V != G_VOID) ST(i) = sv_2mortal(newSVsv(ST(i))); else { #ifdef SV_CHECK_THINKFIRST SV_CHECK_THINKFIRST(ST(i)); #endif if (SvREADONLY(ST(i))) croak("Can't inline decode readonly string in decode_entities()"); } decode_entities(aTHX_ ST(i), entity2char, 0); } SP += items; void _decode_entities(string, entities, ...) SV* string SV* entities PREINIT: HV* entities_hv; bool expand_prefix = (items > 2) ? SvTRUE(ST(2)) : 0; CODE: if (SvOK(entities)) { if (SvROK(entities) && SvTYPE(SvRV(entities)) == SVt_PVHV) { entities_hv = (HV*)SvRV(entities); } else { croak("2nd argument must be hash reference"); } } else { entities_hv = 0; } #ifdef SV_CHECK_THINKFIRST SV_CHECK_THINKFIRST(string); #endif if (SvREADONLY(string)) croak("Can't inline decode readonly string in _decode_entities()"); decode_entities(aTHX_ string, entities_hv, expand_prefix); bool _probably_utf8_chunk(string) SV* string PREINIT: STRLEN len; char *s; CODE: #ifdef UNICODE_HTML_PARSER sv_utf8_downgrade(string, 0); s = SvPV(string, len); RETVAL = probably_utf8_chunk(aTHX_ s, len); #else RETVAL = 0; /* avoid never initialized complains from compiler */ croak("_probably_utf8_chunk() only works for Unicode enabled perls"); #endif OUTPUT: RETVAL int UNICODE_SUPPORT() PROTOTYPE: CODE: #ifdef UNICODE_HTML_PARSER RETVAL = 1; #else RETVAL = 0; #endif OUTPUT: RETVAL MODULE = HTML::Parser PACKAGE = HTML::Parser